Skip to content

Commit fd39823

Browse files
committed
Update and bug fix
Update: PoGo is now compatible with Ensembl and Gencode annotations. Bug fix: reverse strand proteincoding features do not require CDS entries to be oriented based on translation direction but can be sorted in ascending coordinate direction as well.
1 parent 913a91a commit fd39823

6 files changed

Lines changed: 202 additions & 112 deletions

File tree

PoGo/src/Coordinates.h

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,11 @@ struct GenomeCoordinates : Coordinates {
9191
return true;
9292
}
9393
if (lhs.chr.getValue() == rhs.chr.getValue()) {
94-
return lhs.start < rhs.start && lhs.end < rhs.end && lhs.end >= rhs.start;
94+
if ((lhs.strand == Strand::rev && rhs.strand == Strand::rev) || (lhs.strand == Strand::unk && rhs.strand == Strand::rev) || (lhs.strand == Strand::rev && rhs.strand == Strand::unk)) {
95+
return lhs.start > rhs.start && lhs.end > rhs.end && lhs.start >= rhs.end;
96+
} else {
97+
return lhs.start < rhs.start && lhs.end < rhs.end && lhs.end <= rhs.start;
98+
}
9599
}
96100
return lhs.chr.getValue() < rhs.chr.getValue();
97101
}
@@ -102,10 +106,17 @@ struct GenomeCoordinates : Coordinates {
102106

103107
bool operator<(const GenomeCoordinates& rhs) const {
104108
if (chr.isScaffold() && rhs.chr.isScaffold() && chrscaf == rhs.chrscaf) {
105-
if (start == rhs.start) {
106-
return end < rhs.end;
109+
if ((strand == Strand::rev && rhs.strand == Strand::rev) || (strand == Strand::unk && rhs.strand == Strand::rev) || (strand == Strand::rev && rhs.strand == Strand::unk)) {
110+
if (end == rhs.end) {
111+
return start > rhs.start;
112+
}
113+
return end > rhs.end;
114+
} else {
115+
if (start == rhs.start) {
116+
return end < rhs.end;
117+
}
118+
return start < rhs.start;
107119
}
108-
return start < rhs.start;
109120
}
110121
if (chr.isScaffold() && rhs.chr.isScaffold() && chrscaf != rhs.chrscaf) {
111122
return chrscaf < rhs.chrscaf;
@@ -117,10 +128,17 @@ struct GenomeCoordinates : Coordinates {
117128
return true;
118129
}
119130
if (chr.getValue() == rhs.chr.getValue()) {
120-
if (start == rhs.start) {
121-
return end < rhs.end;
131+
if ((strand == Strand::rev && rhs.strand == Strand::rev) || (strand == Strand::unk && rhs.strand == Strand::rev) || (strand == Strand::rev && rhs.strand == Strand::unk)) {
132+
if (end == rhs.end) {
133+
return start > rhs.start;
134+
}
135+
return end > rhs.end;
136+
} else {
137+
if (start == rhs.start) {
138+
return end < rhs.end;
139+
}
140+
return start < rhs.start;
122141
}
123-
return start < rhs.start;
124142
}
125143
return chr.getValue() < rhs.chr.getValue();
126144
}

PoGo/src/GTFParser.cpp

Lines changed: 133 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ bool GTFParser::is_first_strand(std::vector<std::string> const& tokens) {
3434
return tokens.at(6).compare("+") == 0;
3535
}
3636

37+
bool GTFParser::is_first_strand(std::string const& token) {
38+
return token.compare("+") == 0;
39+
}
40+
41+
bool GTFParser::is_first_strand(Strand const& token) {
42+
return token == Strand::fwd;
43+
}
44+
3745
bool GTFParser::is_cds(std::vector<std::string> const& tokens) {
3846
return tokens.at(2).compare("CDS") == 0;
3947
}
@@ -50,6 +58,121 @@ bool GTFParser::is_next_gene(std::vector<std::string> const& tokens) {
5058
return tokens.at(2).compare("gene") == 0;
5159
}
5260

61+
void GTFParser::protein_exons_combine(CoordinateMapType & coordinates_map, std::list<GenomeCoordinates> & CDS_coords) {
62+
CDS_coords.sort(GenomeCoordinates());
63+
Coordinates protein_coordinates = Coordinates();
64+
Coordinates prev_proteint_coordinates = Coordinates();
65+
prev_proteint_coordinates = Coordinates();
66+
prev_proteint_coordinates.Cterm = off3;
67+
prev_proteint_coordinates.Nterm = off3;
68+
prev_proteint_coordinates.start = 0;
69+
prev_proteint_coordinates.end = 0;
70+
71+
for (std::list<GenomeCoordinates>::iterator it = CDS_coords.begin(); it != CDS_coords.end(); ++it) {
72+
protein_exons_combine(protein_coordinates, prev_proteint_coordinates, (*it), coordinates_map);
73+
}
74+
75+
}
76+
77+
void GTFParser::protein_exons_combine(Coordinates & protein_coordinates, Coordinates & prev_proteint_coordinates, GenomeCoordinates & genCoord, CoordinateMapType & coordinates_map) {
78+
79+
protein_coordinates = Coordinates();
80+
// get nterm from prev exon
81+
if (genCoord.frame != unknown) {
82+
protein_coordinates.Nterm = Offset(int(genCoord.frame));
83+
}
84+
else {
85+
if (prev_proteint_coordinates.Cterm != off3) {
86+
protein_coordinates.Nterm = Offset(3 - prev_proteint_coordinates.Cterm);
87+
}
88+
else {
89+
protein_coordinates.Nterm = off3;
90+
}
91+
}
92+
93+
int length = 0;
94+
95+
if (is_first_strand(genCoord.strand)) {
96+
length = genCoord.end - genCoord.start + 1;
97+
}
98+
else if (!is_first_strand(genCoord.strand)) {
99+
length = genCoord.start - genCoord.end + 1;
100+
}
101+
102+
// calc cterm
103+
if (length % 3 == 0) {
104+
if (protein_coordinates.Nterm != off3) {
105+
protein_coordinates.Cterm = Offset(3 - protein_coordinates.Nterm);
106+
}
107+
else {
108+
protein_coordinates.Cterm = off3;
109+
}
110+
}
111+
else if (length % 3 == 2) {
112+
if (protein_coordinates.Nterm == off3) {
113+
protein_coordinates.Cterm = off2;
114+
}
115+
else if (protein_coordinates.Nterm == off2) {
116+
protein_coordinates.Cterm = off3;
117+
}
118+
else if (protein_coordinates.Nterm == off1) {
119+
protein_coordinates.Cterm = off1;
120+
}
121+
}
122+
else if (length % 3 == 1) {
123+
if (protein_coordinates.Nterm == off3) {
124+
protein_coordinates.Cterm = off1;
125+
}
126+
else if (protein_coordinates.Nterm == off1) {
127+
protein_coordinates.Cterm = off3;
128+
}
129+
else if (protein_coordinates.Nterm == off2) {
130+
protein_coordinates.Cterm = off2;
131+
}
132+
}
133+
134+
// calc protein coordinates
135+
if (protein_coordinates.Nterm != off3) {
136+
protein_coordinates.start = prev_proteint_coordinates.end;
137+
}
138+
else {
139+
if (prev_proteint_coordinates.end == 0 && coordinates_map.empty()) {
140+
protein_coordinates.start = 0;
141+
}
142+
else {
143+
protein_coordinates.start = prev_proteint_coordinates.end + 1;
144+
}
145+
}
146+
147+
int offsets = 0;
148+
if (protein_coordinates.Nterm != off3) {
149+
offsets = offsets + protein_coordinates.Nterm;
150+
}
151+
152+
if (is_first_strand(genCoord.strand)) {
153+
length = genCoord.end - genCoord.start + 1 - offsets;
154+
}
155+
else if (!is_first_strand(genCoord.strand)) {
156+
length = genCoord.start - genCoord.end + 1 - offsets;
157+
}
158+
159+
int peplength = length / 3;
160+
161+
int pepend = protein_coordinates.start + peplength - 1;
162+
if (protein_coordinates.Cterm != off3) {
163+
pepend = pepend + 1;
164+
}
165+
if (protein_coordinates.Nterm != off3) {
166+
pepend = pepend + 1;
167+
}
168+
169+
protein_coordinates.end = pepend;
170+
171+
prev_proteint_coordinates = protein_coordinates;
172+
173+
coordinates_map.insert(CoordinateMapType::value_type(protein_coordinates, genCoord));
174+
}
175+
53176
assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrapper, MappedPeptides& mapping) {
54177
if (!open(file)) {
55178
throw GTFParser__file_not_found_exception();
@@ -61,10 +184,10 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
61184
Coordinates prev_proteint_coordinates = Coordinates();
62185
assembly assem = none;
63186
std::vector<std::string> tokens;
187+
std::list<GenomeCoordinates> CDS_coords;
64188
while (std::getline(m_ifstream, m_line)) {
65189
if ((m_line[0] != '#')) {
66190
tokenize(m_line, tokens, "\t");
67-
68191
if (is_next_gene(tokens)) {
69192
assembly assemtemp = mapping.add_gene_from_gtf(m_line);
70193
if (assem == none) {
@@ -74,6 +197,10 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
74197
}
75198
}
76199
if (is_next_transcript(tokens)) {
200+
if (p_protein_entry != nullptr && !CDS_coords.empty()) {
201+
protein_exons_combine(coordinates_map, CDS_coords);
202+
}
203+
CDS_coords.clear();
77204
mapping.add_transcript_id_to_gene(m_line);
78205
if (p_protein_entry != nullptr) {
79206
p_protein_entry->set_coordinate_map(coordinates_map);
@@ -83,12 +210,6 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
83210
std::cout << "ERROR: No entry for with transcript ID: " << GeneEntry::extract_transcript_id(m_line, GENOME_MAPPER_GLOBALS::ID::ID_VERSION_INCLUDE) << "\n";
84211
continue;
85212
}
86-
protein_coordinates = Coordinates();
87-
prev_proteint_coordinates = Coordinates();
88-
prev_proteint_coordinates.Cterm = off3;
89-
prev_proteint_coordinates.Nterm = off3;
90-
prev_proteint_coordinates.start = 0;
91-
prev_proteint_coordinates.end = 0;
92213
coordinates_map = CoordinateMapType();
93214
}
94215
else if (is_exon(tokens)) {
@@ -102,92 +223,15 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
102223
tmp_exonId = exonId;
103224
}
104225
genCoord.exonid = tmp_exonId;
105-
protein_coordinates = Coordinates();
106-
// get nterm from prev exon
107-
if (genCoord.frame != unknown) {
108-
protein_coordinates.Nterm = Offset(int(genCoord.frame));
109-
} else {
110-
if (prev_proteint_coordinates.Cterm != off3) {
111-
protein_coordinates.Nterm = Offset(3 - prev_proteint_coordinates.Cterm);
112-
} else {
113-
protein_coordinates.Nterm = off3;
114-
}
115-
}
116-
117-
int length = 0;
118-
119-
if (is_first_strand(tokens)) {
120-
length = genCoord.end - genCoord.start + 1;
121-
} else if (!is_first_strand(tokens)) {
122-
length = genCoord.start - genCoord.end + 1;
123-
}
124-
125-
// calc cterm
126-
if (length % 3 == 0) {
127-
if (protein_coordinates.Nterm != off3) {
128-
protein_coordinates.Cterm = Offset(3 - protein_coordinates.Nterm);
129-
} else {
130-
protein_coordinates.Cterm = off3;
131-
}
132-
} else if (length % 3 == 2) {
133-
if (protein_coordinates.Nterm == off3) {
134-
protein_coordinates.Cterm = off2;
135-
} else if (protein_coordinates.Nterm == off2) {
136-
protein_coordinates.Cterm = off3;
137-
} else if (protein_coordinates.Nterm == off1) {
138-
protein_coordinates.Cterm = off1;
139-
}
140-
} else if (length % 3 == 1) {
141-
if (protein_coordinates.Nterm == off3) {
142-
protein_coordinates.Cterm = off1;
143-
} else if (protein_coordinates.Nterm == off1) {
144-
protein_coordinates.Cterm = off3;
145-
} else if (protein_coordinates.Nterm == off2) {
146-
protein_coordinates.Cterm = off2;
147-
}
148-
}
149-
150-
// calc protein coordinates
151-
if (protein_coordinates.Nterm != off3) {
152-
protein_coordinates.start = prev_proteint_coordinates.end;
153-
} else {
154-
if (prev_proteint_coordinates.end == 0 && coordinates_map.empty()) {
155-
protein_coordinates.start = 0;
156-
} else {
157-
protein_coordinates.start = prev_proteint_coordinates.end + 1;
158-
}
159-
}
160-
161-
int offsets = 0;
162-
if (protein_coordinates.Nterm != off3) {
163-
offsets = offsets + protein_coordinates.Nterm;
164-
}
165-
166-
if (is_first_strand(tokens)) {
167-
length = genCoord.end - genCoord.start + 1 - offsets;
168-
} else if (!is_first_strand(tokens)) {
169-
length = genCoord.start - genCoord.end + 1 - offsets;
170-
}
171-
172-
int peplength = length / 3;
173-
174-
int pepend = protein_coordinates.start + peplength - 1;
175-
if (protein_coordinates.Cterm != off3) {
176-
pepend = pepend + 1;
177-
}
178-
if (protein_coordinates.Nterm != off3) {
179-
pepend = pepend + 1;
180-
}
181-
182-
protein_coordinates.end = pepend;
183-
184-
prev_proteint_coordinates = protein_coordinates;
185-
186-
coordinates_map.insert(CoordinateMapType::value_type(protein_coordinates, genCoord));
226+
CDS_coords.push_back(genCoord);
187227
}
188228
}
189229
tokens.clear();
190230
}
231+
if (p_protein_entry != nullptr && !CDS_coords.empty()) {
232+
protein_exons_combine(coordinates_map, CDS_coords);
233+
}
234+
CDS_coords.clear();
191235
if (p_protein_entry != nullptr) {
192236
p_protein_entry->set_coordinate_map(coordinates_map);
193237
}

PoGo/src/GTFParser.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,15 @@ class GTFParser {
3131
//closes the filestream
3232
void close();
3333

34+
void protein_exons_combine(Coordinates & protein_coordinates, Coordinates & prev_proteint_coordinates, GenomeCoordinates & genCoord, CoordinateMapType & coordinates_map);
35+
void protein_exons_combine(CoordinateMapType & coordinates_map, std::list<GenomeCoordinates> & CDS_coords);
36+
3437
//returns true if in the GTF at position 6 there is a + (plus strand)
3538
bool static is_first_strand(std::vector<std::string> const& tokens);
39+
//returns true if the token string is a + (plus strand)
40+
bool static is_first_strand(std::string const& token);
41+
//returns true if strand is fwd or 1
42+
bool static is_first_strand(Strand const& token);
3643
//returns true if position 2 in the GTF says "CDS"
3744
bool static is_cds(std::vector<std::string> const& tokens);
3845
//returns true if position 2 in the GTF says "exon"

PoGo/src/GeneEntry.cpp

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -59,36 +59,45 @@ bool GeneEntry::is_patchhaploscaff() {
5959
}
6060

6161
std::string GeneEntry::extract_gene_id(std::string gtfGeneLine, bool versionincl) {
62-
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID) + GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID.length();
62+
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID);
6363
std::string value = "";
6464
if (index != std::string::npos) {
65-
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
66-
value = value + gtfGeneLine[index];
67-
index += 1;
65+
index = index + GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID.length();
66+
if (index != std::string::npos) {
67+
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
68+
value = value + gtfGeneLine[index];
69+
index += 1;
70+
}
6871
}
6972
}
7073
return value;
7174
}
7275

7376
std::string GeneEntry::extract_transcript_id(std::string gtfGeneLine, bool versionincl) {
74-
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID) + GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID.length();
77+
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID);
7578
std::string value = "";
7679
if (index != std::string::npos) {
77-
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
78-
value = value + gtfGeneLine[index];
79-
index += 1;
80+
index = index + GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID.length();
81+
if (index != std::string::npos) {
82+
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
83+
value = value + gtfGeneLine[index];
84+
index += 1;
85+
}
8086
}
8187
}
8288
return value;
8389
}
8490

8591
std::string GeneEntry::extract_exon_id(std::string gtfGeneLine, bool versionincl) {
86-
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID) + GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID.length();
92+
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID);
8793
std::string value = "";
8894
if (index != std::string::npos) {
89-
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
90-
value = value + gtfGeneLine[index];
91-
index += 1;
95+
index = index + GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID.length();
96+
if (index != std::string::npos) {
97+
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
98+
value = value + gtfGeneLine[index];
99+
index += 1;
100+
}
92101
}
93102
}
94103
return value;

0 commit comments

Comments
 (0)