@@ -34,6 +34,14 @@ bool GTFParser::is_first_strand(std::vector<std::string> const& tokens) {
3434 return tokens.at (6 ).compare (" +" ) == 0 ;
3535}
3636
37+ bool GTFParser::is_first_strand (std::string const & token) {
38+ return token.compare (" +" ) == 0 ;
39+ }
40+
41+ bool GTFParser::is_first_strand (Strand const & token) {
42+ return token == Strand::fwd;
43+ }
44+
3745bool GTFParser::is_cds (std::vector<std::string> const & tokens) {
3846 return tokens.at (2 ).compare (" CDS" ) == 0 ;
3947}
@@ -50,6 +58,121 @@ bool GTFParser::is_next_gene(std::vector<std::string> const& tokens) {
5058 return tokens.at (2 ).compare (" gene" ) == 0 ;
5159}
5260
61+ void GTFParser::protein_exons_combine (CoordinateMapType & coordinates_map, std::list<GenomeCoordinates> & CDS_coords) {
62+ CDS_coords.sort (GenomeCoordinates ());
63+ Coordinates protein_coordinates = Coordinates ();
64+ Coordinates prev_proteint_coordinates = Coordinates ();
65+ prev_proteint_coordinates = Coordinates ();
66+ prev_proteint_coordinates.Cterm = off3;
67+ prev_proteint_coordinates.Nterm = off3;
68+ prev_proteint_coordinates.start = 0 ;
69+ prev_proteint_coordinates.end = 0 ;
70+
71+ for (std::list<GenomeCoordinates>::iterator it = CDS_coords.begin (); it != CDS_coords.end (); ++it) {
72+ protein_exons_combine (protein_coordinates, prev_proteint_coordinates, (*it), coordinates_map);
73+ }
74+
75+ }
76+
77+ void GTFParser::protein_exons_combine (Coordinates & protein_coordinates, Coordinates & prev_proteint_coordinates, GenomeCoordinates & genCoord, CoordinateMapType & coordinates_map) {
78+
79+ protein_coordinates = Coordinates ();
80+ // get nterm from prev exon
81+ if (genCoord.frame != unknown) {
82+ protein_coordinates.Nterm = Offset (int (genCoord.frame ));
83+ }
84+ else {
85+ if (prev_proteint_coordinates.Cterm != off3) {
86+ protein_coordinates.Nterm = Offset (3 - prev_proteint_coordinates.Cterm );
87+ }
88+ else {
89+ protein_coordinates.Nterm = off3;
90+ }
91+ }
92+
93+ int length = 0 ;
94+
95+ if (is_first_strand (genCoord.strand )) {
96+ length = genCoord.end - genCoord.start + 1 ;
97+ }
98+ else if (!is_first_strand (genCoord.strand )) {
99+ length = genCoord.start - genCoord.end + 1 ;
100+ }
101+
102+ // calc cterm
103+ if (length % 3 == 0 ) {
104+ if (protein_coordinates.Nterm != off3) {
105+ protein_coordinates.Cterm = Offset (3 - protein_coordinates.Nterm );
106+ }
107+ else {
108+ protein_coordinates.Cterm = off3;
109+ }
110+ }
111+ else if (length % 3 == 2 ) {
112+ if (protein_coordinates.Nterm == off3) {
113+ protein_coordinates.Cterm = off2;
114+ }
115+ else if (protein_coordinates.Nterm == off2) {
116+ protein_coordinates.Cterm = off3;
117+ }
118+ else if (protein_coordinates.Nterm == off1) {
119+ protein_coordinates.Cterm = off1;
120+ }
121+ }
122+ else if (length % 3 == 1 ) {
123+ if (protein_coordinates.Nterm == off3) {
124+ protein_coordinates.Cterm = off1;
125+ }
126+ else if (protein_coordinates.Nterm == off1) {
127+ protein_coordinates.Cterm = off3;
128+ }
129+ else if (protein_coordinates.Nterm == off2) {
130+ protein_coordinates.Cterm = off2;
131+ }
132+ }
133+
134+ // calc protein coordinates
135+ if (protein_coordinates.Nterm != off3) {
136+ protein_coordinates.start = prev_proteint_coordinates.end ;
137+ }
138+ else {
139+ if (prev_proteint_coordinates.end == 0 && coordinates_map.empty ()) {
140+ protein_coordinates.start = 0 ;
141+ }
142+ else {
143+ protein_coordinates.start = prev_proteint_coordinates.end + 1 ;
144+ }
145+ }
146+
147+ int offsets = 0 ;
148+ if (protein_coordinates.Nterm != off3) {
149+ offsets = offsets + protein_coordinates.Nterm ;
150+ }
151+
152+ if (is_first_strand (genCoord.strand )) {
153+ length = genCoord.end - genCoord.start + 1 - offsets;
154+ }
155+ else if (!is_first_strand (genCoord.strand )) {
156+ length = genCoord.start - genCoord.end + 1 - offsets;
157+ }
158+
159+ int peplength = length / 3 ;
160+
161+ int pepend = protein_coordinates.start + peplength - 1 ;
162+ if (protein_coordinates.Cterm != off3) {
163+ pepend = pepend + 1 ;
164+ }
165+ if (protein_coordinates.Nterm != off3) {
166+ pepend = pepend + 1 ;
167+ }
168+
169+ protein_coordinates.end = pepend;
170+
171+ prev_proteint_coordinates = protein_coordinates;
172+
173+ coordinates_map.insert (CoordinateMapType::value_type (protein_coordinates, genCoord));
174+ }
175+
53176assembly GTFParser::read (const std::string& file, CoordinateWrapper& coordwrapper, MappedPeptides& mapping) {
54177 if (!open (file)) {
55178 throw GTFParser__file_not_found_exception ();
@@ -61,10 +184,10 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
61184 Coordinates prev_proteint_coordinates = Coordinates ();
62185 assembly assem = none;
63186 std::vector<std::string> tokens;
187+ std::list<GenomeCoordinates> CDS_coords;
64188 while (std::getline (m_ifstream, m_line)) {
65189 if ((m_line[0 ] != ' #' )) {
66190 tokenize (m_line, tokens, " \t " );
67-
68191 if (is_next_gene (tokens)) {
69192 assembly assemtemp = mapping.add_gene_from_gtf (m_line);
70193 if (assem == none) {
@@ -74,6 +197,10 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
74197 }
75198 }
76199 if (is_next_transcript (tokens)) {
200+ if (p_protein_entry != nullptr && !CDS_coords.empty ()) {
201+ protein_exons_combine (coordinates_map, CDS_coords);
202+ }
203+ CDS_coords.clear ();
77204 mapping.add_transcript_id_to_gene (m_line);
78205 if (p_protein_entry != nullptr ) {
79206 p_protein_entry->set_coordinate_map (coordinates_map);
@@ -83,12 +210,6 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
83210 std::cout << " ERROR: No entry for with transcript ID: " << GeneEntry::extract_transcript_id (m_line, GENOME_MAPPER_GLOBALS::ID::ID_VERSION_INCLUDE) << " \n " ;
84211 continue ;
85212 }
86- protein_coordinates = Coordinates ();
87- prev_proteint_coordinates = Coordinates ();
88- prev_proteint_coordinates.Cterm = off3;
89- prev_proteint_coordinates.Nterm = off3;
90- prev_proteint_coordinates.start = 0 ;
91- prev_proteint_coordinates.end = 0 ;
92213 coordinates_map = CoordinateMapType ();
93214 }
94215 else if (is_exon (tokens)) {
@@ -102,92 +223,15 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
102223 tmp_exonId = exonId;
103224 }
104225 genCoord.exonid = tmp_exonId;
105- protein_coordinates = Coordinates ();
106- // get nterm from prev exon
107- if (genCoord.frame != unknown) {
108- protein_coordinates.Nterm = Offset (int (genCoord.frame ));
109- } else {
110- if (prev_proteint_coordinates.Cterm != off3) {
111- protein_coordinates.Nterm = Offset (3 - prev_proteint_coordinates.Cterm );
112- } else {
113- protein_coordinates.Nterm = off3;
114- }
115- }
116-
117- int length = 0 ;
118-
119- if (is_first_strand (tokens)) {
120- length = genCoord.end - genCoord.start + 1 ;
121- } else if (!is_first_strand (tokens)) {
122- length = genCoord.start - genCoord.end + 1 ;
123- }
124-
125- // calc cterm
126- if (length % 3 == 0 ) {
127- if (protein_coordinates.Nterm != off3) {
128- protein_coordinates.Cterm = Offset (3 - protein_coordinates.Nterm );
129- } else {
130- protein_coordinates.Cterm = off3;
131- }
132- } else if (length % 3 == 2 ) {
133- if (protein_coordinates.Nterm == off3) {
134- protein_coordinates.Cterm = off2;
135- } else if (protein_coordinates.Nterm == off2) {
136- protein_coordinates.Cterm = off3;
137- } else if (protein_coordinates.Nterm == off1) {
138- protein_coordinates.Cterm = off1;
139- }
140- } else if (length % 3 == 1 ) {
141- if (protein_coordinates.Nterm == off3) {
142- protein_coordinates.Cterm = off1;
143- } else if (protein_coordinates.Nterm == off1) {
144- protein_coordinates.Cterm = off3;
145- } else if (protein_coordinates.Nterm == off2) {
146- protein_coordinates.Cterm = off2;
147- }
148- }
149-
150- // calc protein coordinates
151- if (protein_coordinates.Nterm != off3) {
152- protein_coordinates.start = prev_proteint_coordinates.end ;
153- } else {
154- if (prev_proteint_coordinates.end == 0 && coordinates_map.empty ()) {
155- protein_coordinates.start = 0 ;
156- } else {
157- protein_coordinates.start = prev_proteint_coordinates.end + 1 ;
158- }
159- }
160-
161- int offsets = 0 ;
162- if (protein_coordinates.Nterm != off3) {
163- offsets = offsets + protein_coordinates.Nterm ;
164- }
165-
166- if (is_first_strand (tokens)) {
167- length = genCoord.end - genCoord.start + 1 - offsets;
168- } else if (!is_first_strand (tokens)) {
169- length = genCoord.start - genCoord.end + 1 - offsets;
170- }
171-
172- int peplength = length / 3 ;
173-
174- int pepend = protein_coordinates.start + peplength - 1 ;
175- if (protein_coordinates.Cterm != off3) {
176- pepend = pepend + 1 ;
177- }
178- if (protein_coordinates.Nterm != off3) {
179- pepend = pepend + 1 ;
180- }
181-
182- protein_coordinates.end = pepend;
183-
184- prev_proteint_coordinates = protein_coordinates;
185-
186- coordinates_map.insert (CoordinateMapType::value_type (protein_coordinates, genCoord));
226+ CDS_coords.push_back (genCoord);
187227 }
188228 }
189229 tokens.clear ();
190230 }
231+ if (p_protein_entry != nullptr && !CDS_coords.empty ()) {
232+ protein_exons_combine (coordinates_map, CDS_coords);
233+ }
234+ CDS_coords.clear ();
191235 if (p_protein_entry != nullptr ) {
192236 p_protein_entry->set_coordinate_map (coordinates_map);
193237 }
0 commit comments