Skip to content

Commit bcdf271

Browse files
committed
major changes
Removed species selection; all species with peptide and GTF files available from Ensembl are now supported. Added a new parameter, -genome, that takes a genome FASTA file (nucleotide sequences of chromosomes, scaffolds, etc.) to inform the separation of chromosomes (primary assembly) and scaffolds in the output files. If it is not provided, the coordinates are extracted solely from the GTF file. Bug fix for multiple input files: all peptides are now mapped.
1 parent 2499b31 commit bcdf271

21 files changed

Lines changed: 363 additions & 750 deletions

PoGo/src/Chromosome.cpp

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#include "Chromosome.h"
2+
3+
// Storage for the class-wide table mapping a chromosome name to its rank (filled by addChr).
std::map<std::string, int> Chromosome::chr_to_int;
4+
// Storage for the class-wide reverse table mapping a rank back to the chromosome name.
std::map<int, std::string> Chromosome::int_to_chr;
5+
// Storage for the class-wide table of scaffold names consulted by addChr and isScaffold.
std::map<std::string, int> Chromosome::scaffold_names;
6+
7+
// Default constructor: the chromosome starts out with the placeholder name "NA".
Chromosome::Chromosome(void) : name("NA") {
}
10+
11+
// Builds a chromosome that carries the given name exactly as passed in;
// the name is neither normalised nor checked against the lookup tables.
Chromosome::Chromosome(std::string const & name) : name(name) {
}
14+
15+
// Destructor: intentionally empty, the body performs no work.
Chromosome::~Chromosome(void) {}
18+
19+
// Accessor for the shared name -> rank table; returns a mutable reference
// to the static member so callers can both query and extend it.
std::map<std::string, int>& Chromosome::getChr_to_int(void) {
    return Chromosome::chr_to_int;
}
22+
23+
// Accessor for the shared rank -> name table; returns a mutable reference
// to the static member so callers can both query and extend it.
std::map<int, std::string>& Chromosome::getInt_to_chr(void) {
    return Chromosome::int_to_chr;
}
26+
27+
// Accessor for the shared scaffold-name table; returns a mutable reference
// to the static member so callers can both query and extend it.
std::map<std::string, int>& Chromosome::getScaffold_names(void) {
    return Chromosome::scaffold_names;
}
30+
31+
std::string Chromosome::forValue(int const & value) {
32+
std::map<int, std::string>::iterator it = getInt_to_chr().find(value);
33+
if (it != getInt_to_chr().end()) {
34+
return it->second;
35+
}
36+
return "NA";
37+
}
38+
39+
int Chromosome::forName(std::string const & name) {
40+
std::map<std::string, int>::iterator it = getChr_to_int().find(name);
41+
if (it != getChr_to_int().end()) {
42+
return it->second;
43+
}
44+
return -1;
45+
}
46+
47+
void Chromosome::addChr(std::string const & name) {
48+
std::string tmpname = name;
49+
std::string substring = tmpname.substr(0, 3);
50+
if (substring == "chr" || substring == "Chr") {
51+
tmpname = tmpname.substr(3);
52+
}
53+
std::map<std::string, int>::iterator it = getChr_to_int().find(tmpname);
54+
if (it == getChr_to_int().end())
55+
{
56+
it = getScaffold_names().find(tmpname);
57+
if (it == getScaffold_names().end()) {
58+
int newrank = getChr_to_int().size() + 1;
59+
getChr_to_int().insert(std::pair<std::string, int>(tmpname, newrank));
60+
getInt_to_chr().insert(std::pair<int, std::string>(newrank, tmpname));
61+
if (tmpname == "M") {
62+
newrank = newrank + 1;
63+
getChr_to_int().insert(std::pair<std::string, int>("MT", newrank));
64+
getInt_to_chr().insert(std::pair<int, std::string>(newrank, "MT"));
65+
}
66+
else if (tmpname == "MT") {
67+
newrank = newrank + 1;
68+
getChr_to_int().insert(std::pair<std::string, int>("M", newrank));
69+
getInt_to_chr().insert(std::pair<int, std::string>(newrank, "M"));
70+
}
71+
}
72+
}
73+
}
74+
75+
void Chromosome::addScaffold(std::string const & name) {
76+
std::map<std::string, int>::iterator it = getChr_to_int().find("scaffold");
77+
if (it != getChr_to_int().end()) {
78+
int newrank = getChr_to_int().size() + 1;
79+
getChr_to_int().insert(std::pair<std::string, int>("scaffold", newrank));
80+
}
81+
}
82+
83+
int Chromosome::getValue(void) const {
84+
std::map<std::string, int>::iterator it = getChr_to_int().find(name);
85+
if (it != getChr_to_int().end()) {
86+
return it->second;
87+
}
88+
return -1;
89+
}
90+
91+
std::string Chromosome::getName(void) const {
92+
return name;
93+
}
94+
95+
// Replaces the stored name with the given one; the lookup tables are not touched.
void Chromosome::setName(std::string const& name) {
    this->name.assign(name);
}
98+
99+
bool Chromosome::isNA(void) const {
100+
return name == "NA";
101+
}
102+
103+
bool Chromosome::isScaffold(void) const {
104+
std::map<std::string, int>::iterator it = getScaffold_names().find(name);
105+
return it != getScaffold_names().end();
106+
}

PoGo/src/Chromosome.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#ifndef CHROMOSOME_H
2+
#define CHROMOSOME_H
3+
4+
#include "Globals.h"
5+
6+
// A chromosome identified by its name. Ordering information is not stored in
// the instance; it is resolved through class-wide lookup tables that are
// filled at runtime via addChr() / addScaffold().
class Chromosome {
public:
    // ---- construction / destruction ----
    // Creates a chromosome with the placeholder name "NA".
    Chromosome();
    // Creates a chromosome with the given name.
    Chromosome(std::string const& name);
    ~Chromosome();

    // ---- class-wide registry ----
    // Name registered for an order value, or "NA" if there is none.
    static std::string forValue(int const& value);
    // Order value registered for a name, or -1 if there is none.
    static int forName(std::string const& name);
    // Registers a chromosome name (a leading "chr"/"Chr" is stripped).
    static void addChr(std::string const& name);
    // Registers a scaffold name.
    static void addScaffold(std::string const& name);

    // ---- per-instance queries ----
    // Order value of this chromosome, or -1 if its name is not registered.
    int getValue() const;
    // Name of this chromosome.
    std::string getName() const;
    // Overwrites the name of this chromosome.
    void setName(std::string const& name);
    // True while the name is the placeholder "NA".
    bool isNA() const;
    // True if the name is listed among the registered scaffold names.
    bool isScaffold() const;

private:
    // chromosome name -> order value
    static std::map<std::string, int> chr_to_int;
    // order value -> chromosome name
    static std::map<int, std::string> int_to_chr;
    // registered scaffold names
    static std::map<std::string, int> scaffold_names;

    // name of this particular chromosome
    std::string name;

    // mutable access to the shared tables above
    static std::map<std::string, int>& getChr_to_int();
    static std::map<int, std::string>& getInt_to_chr();
    static std::map<std::string, int>& getScaffold_names();
};
39+
40+
#endif

PoGo/src/CoordinateWrapper.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ int CoordinateWrapper::size() const {
1111
return m_map.size();
1212
}
1313

14+
void CoordinateWrapper::renew() {
15+
m_existing_peptides->clear();
16+
delete(m_existing_peptides);
17+
m_existing_peptides = new ExistingPeptides();
18+
}
19+
1420
ProteinEntry& CoordinateWrapper::lookup_entry(std::string transcriptId) {
1521
return m_map[transcriptId];
1622
}

PoGo/src/CoordinateWrapper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class CoordinateWrapper {
2222
void add(ProteinEntry entry);
2323
//returns the number of ProteinEntry elements currently in the CoordinateWrapper.
2424
int size() const;
25+
26+
//clears and deletes the current ExistingPeptides object and replaces it with a newly allocated one.
void renew();
2527

2628
//reads and parses a fasta file and adds all of them to the CoordinateWrapper.
2729
void read_fasta_file(std::string file);

PoGo/src/Coordinates.h

Lines changed: 28 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#ifndef COORDINATES_H
22
#define COORDINATES_H
33

4-
#include "Globals.h"
4+
#include "Chromosome.h"
55

66
//possible offsets.
77
enum Offset {
@@ -10,107 +10,6 @@ enum Offset {
1010
off3 = 3
1111
};
1212

13-
//possible chromosomes
14-
enum Chromosome {
15-
chr1 = 1,
16-
chr1A = 2,
17-
chr1B = 3,
18-
chr2 = 4,
19-
chr2A = 5,
20-
chr2a = 6,
21-
chr2B = 7,
22-
chr2b = 8,
23-
chr3 = 9,
24-
chr4 = 10,
25-
chr4A = 11,
26-
chr5 = 12,
27-
chr6 = 13,
28-
chr7 = 14,
29-
chr8 = 15,
30-
chr9 = 16,
31-
chr10 = 17,
32-
chr11 = 18,
33-
chr12 = 19,
34-
chr13 = 20,
35-
chr14 = 21,
36-
chr15 = 22,
37-
chr16 = 23,
38-
chr17 = 24,
39-
chr18 = 25,
40-
chr19 = 26,
41-
chr20 = 27,
42-
chr21 = 28,
43-
chr22 = 29,
44-
chr23 = 30,
45-
chr24 = 31,
46-
chr25 = 32,
47-
chr26 = 33,
48-
chr27 = 34,
49-
chr28 = 35,
50-
chr29 = 36,
51-
chr30 = 37,
52-
chr31 = 38,
53-
chr32 = 39,
54-
chr33 = 40,
55-
chr34 = 41,
56-
chr35 = 42,
57-
chr36 = 43,
58-
chr37 = 44,
59-
chr38 = 45,
60-
chr39 = 46,
61-
chr40 = 47,
62-
chrI = 48,
63-
chrII = 49,
64-
chrIII = 50,
65-
chrIV = 51,
66-
chrV = 52,
67-
chrVI = 53,
68-
chrVII = 54,
69-
chrVIII = 55,
70-
chrIX = 56,
71-
chrX = 57,
72-
chrXI = 58,
73-
chrXII = 59,
74-
chrXIII = 60,
75-
chrXIV = 61,
76-
chrXV = 62,
77-
chrXVI = 63,
78-
chrY = 64,
79-
chrXY = 65,
80-
chrX1 = 66,
81-
chrX2 = 67,
82-
chrX3 = 68,
83-
chrX5 = 69,
84-
chrA1 = 70,
85-
chrA2 = 71,
86-
chrA3 = 72,
87-
chrB1 = 73,
88-
chrB2 = 74,
89-
chrB3 = 75,
90-
chrB4 = 76,
91-
chrC1 = 77,
92-
chrC2 = 78,
93-
chrD1 = 79,
94-
chrD2 = 80,
95-
chrD3 = 81,
96-
chrD4 = 82,
97-
chrE1 = 83,
98-
chrE2 = 84,
99-
chrE3 = 85,
100-
chrF1 = 86,
101-
chrF2 = 87,
102-
chrLG2 = 88,
103-
chrLG5 = 89,
104-
chrLGE22 = 90,
105-
chrW = 91,
106-
chrZ = 92,
107-
chrM = 93,
108-
chrMito = 94,
109-
chrLGE64 = 95,
110-
chrNA = -1,
111-
scaffold = 0
112-
};
113-
11413
//possible strands.
11514
enum Strand {
11615
fwd = 1,
@@ -137,6 +36,13 @@ struct Coordinates {
13736
//cterm offset (see enum Offset)
13837
Offset Cterm;
13938

39+
Coordinates() {
40+
start = 0;
41+
end = 0;
42+
Nterm = Offset::off3;
43+
Cterm = Offset::off3;
44+
}
45+
14046
// lt operator. returns true if lhs.start is less than rhs.start and lhs.end is less than rhs.end and lhs.end is smaller than rhs.start
14147
//otherwise returns false.
14248
bool operator()(const Coordinates& lhs, const Coordinates& rhs) const {
@@ -164,52 +70,59 @@ struct GenomeCoordinates : Coordinates {
16470
//holds the frame.
16571
Frame frame;
16672

73+
// Default state on top of the Coordinates defaults: a default-constructed
// Chromosome, an empty scaffold name, unknown strand and unknown frame.
GenomeCoordinates()
    : Coordinates(), chr(), chrscaf(""), strand(Strand::unk), frame(Frame::unknown) {
}
79+
16780
// Comparator over two genomic positions.
// - Exactly one side on a scaffold: the non-scaffold side orders first.
// - Both on scaffolds with different scaffold names: ordered by scaffold name.
// - Both on chromosomes with different order values: ordered by order value.
// - Same scaffold / same chromosome value: true only if lhs starts and ends
//   before rhs does while lhs.end has not fallen below rhs.start.
bool operator()(const GenomeCoordinates& lhs, const GenomeCoordinates& rhs) const {
    const bool lhsOnScaffold = lhs.chr.isScaffold();
    const bool rhsOnScaffold = rhs.chr.isScaffold();

    // Mixed case: lhs is "less" exactly when rhs is the scaffold.
    if (lhsOnScaffold != rhsOnScaffold) {
        return rhsOnScaffold;
    }

    if (lhsOnScaffold) {
        if (lhs.chrscaf != rhs.chrscaf) {
            return lhs.chrscaf < rhs.chrscaf;
        }
    } else {
        const int lhsRank = lhs.chr.getValue();
        const int rhsRank = rhs.chr.getValue();
        if (lhsRank != rhsRank) {
            return lhsRank < rhsRank;
        }
    }

    return lhs.start < rhs.start && lhs.end < rhs.end && lhs.end >= rhs.start;
}
18598

18699
// "Equality" here means containment on the same sequence: this position must
// lie on the same scaffold (if this is a scaffold) or on a chromosome with the
// same order value (otherwise), and its [start, end] must lie within rhs's.
bool operator==(const GenomeCoordinates& rhs) const {
    const bool sameSequence = chr.isScaffold()
        ? (chrscaf == rhs.chrscaf)
        : (chr.getValue() == rhs.chr.getValue());
    return sameSequence && start >= rhs.start && end <= rhs.end;
}
189102

190103
// Ordering of genomic positions.
// - Exactly one side on a scaffold: the non-scaffold side orders first.
// - Both on scaffolds with different scaffold names: ordered by scaffold name.
// - Both on chromosomes with different order values: ordered by order value.
// - Same scaffold / same chromosome value: ordered by start, ties broken by end.
bool operator<(const GenomeCoordinates& rhs) const {
    const bool selfOnScaffold = chr.isScaffold();
    const bool otherOnScaffold = rhs.chr.isScaffold();

    // Mixed case: this is "less" exactly when rhs is the scaffold.
    if (selfOnScaffold != otherOnScaffold) {
        return otherOnScaffold;
    }

    if (selfOnScaffold) {
        if (chrscaf != rhs.chrscaf) {
            return chrscaf < rhs.chrscaf;
        }
    } else {
        const int selfRank = chr.getValue();
        const int otherRank = rhs.chr.getValue();
        if (selfRank != otherRank) {
            return selfRank < otherRank;
        }
    }

    return (start == rhs.start) ? (end < rhs.end) : (start < rhs.start);
}
214127
};
215128

0 commit comments

Comments
 (0)