From 14746c9ff3db6498adf0b8ade2ae4cb2d85300a0 Mon Sep 17 00:00:00 2001 From: Dmitriy Samborskiy Date: Wed, 5 Jun 2019 00:54:23 +0300 Subject: [PATCH 1/5] Allocate large enough backtrace i/j/state arrays in PosteriorDecoder::initializeBacktrace --- src/hhbacktracemac.cpp | 11 ++++++----- src/hhposteriordecoder.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/hhbacktracemac.cpp b/src/hhbacktracemac.cpp index 9c1a4a19..1fd6caca 100644 --- a/src/hhbacktracemac.cpp +++ b/src/hhbacktracemac.cpp @@ -116,7 +116,7 @@ void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, Vi int step; // counts steps in path through 5-layered dynamic programming matrix int i,j; // query and template match state indices - initializeBacktrace(t,hit); + initializeBacktrace(q,t,hit); // Make sure that backtracing stops when t:M1 or q:M1 is reached (Start state), e.g. sMM[i][1], or sIM[i][1] (M:MM, B:IM) for (i = 0; i <= q.L; ++i) backtrace_matrix.setMatMat(i, 1, elem, ViterbiMatrix::STOP); // b[i][1] = STOP; @@ -270,22 +270,23 @@ void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, Vi ///////////////////////////////////////////////////////////////////////////////////// // Allocate memory for data of new alignment (sequence names, alignment, scores,...) ///////////////////////////////////////////////////////////////////////////////////// -void PosteriorDecoder::initializeBacktrace(HMM & t, Hit & hit) { +void PosteriorDecoder::initializeBacktrace(HMM & q, HMM & t, Hit & hit) { + int maxlen = q.L + t.L + 1; // Allocate new space if(hit.i) { delete[] hit.i; } - hit.i = new int[hit.i2 + hit.j2 + 2]; + hit.i = new int[maxlen]; if(hit.j) { delete[] hit.j; } - hit.j = new int[hit.i2 + hit.j2 + 2]; + hit.j = new int[maxlen]; if(hit.states) { delete[] hit.states; } - hit.states = new char[hit.i2 + hit.j2 + 2]; + hit.states = new char[maxlen]; if(hit.S) { delete[] hit.S; diff --git a/src/hhposteriordecoder.h b/src/hhposteriordecoder.h index 67493ad1..737ce017 100644 --- a/src/hhposteriordecoder.h +++ b/src/hhposteriordecoder.h @@ -127,7 +127,7 @@ class PosteriorDecoder { ViterbiMatrix & viterbi_matrix, float par_mact, const int elem); void backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, ViterbiMatrix & backtrace_matrix, const int elem, Hit & hit, float corr); void writeProfilesToHits(HMM &q, HMM &t, PosteriorMatrix &p_mm, ViterbiMatrix & backtrace_matrix, Hit &hit); - void initializeBacktrace(HMM & t, Hit & hit); + void initializeBacktrace(HMM & q, HMM & t, Hit & hit); void initializeForAlignment(HMM &q, HMM &t, Hit &hit, ViterbiMatrix &viterbi_matrix, const int elem, const int t_max_L, int par_min_overlap); void maskViterbiAlignment(const int q_length, const int t_length, ViterbiMatrix &celloff_matrix, From b9b24dac082a27a0c26589d46910223567020f17 Mon Sep 17 00:00:00 2001 From: Dmitriy Samborskiy Date: Wed, 3 Jul 2019 15:42:45 +0300 Subject: [PATCH 2/5] backtraceMAC - fix errors in MAC algorithm This commit introduces two checks in the MAC realignment algorithm in order to alleviate Issue #153 ("MAC realignment algorithm damages hits", https://github.com/soedinglab/hh-suite/issues/153): 1. If MAC hit is significantly shorter than the Viterbi hit in terms of number of matched columns then the original Viterbi hit is restored and an appropriate warning is issued. This happens when MAC algorithm generates several disjoint local hits or when MAC hit is shorter due to weak score at the start/end regions of the Viterbi hit. The threshold on the MAC hit length is defined by a new option, -mac_min_length (default=0.9, i.e. at least 90% of number of matched columns in Viterbi hit must be in the corresponding MAC hit). There is no obvious way to deal with hit split in the current MAC algorithm aproach, it is expected that the algorithm should find the same Viterbi hit albeit improved and extended at its borders. Usually the MAC hit restores its integrity as soon as the -mact option is decreased. For example, in the test case described in the issue report one could find the proper MAC hit if "-mact 0.33" is used (instead of the default "-mact 0.35" value). Therefore the warning also suggests decreasing -mact option as a viable solution if MAC alignment is still needed. Note: It would be nice to have support for "-mact auto" option that forced the MAC algorithm to decrease -mact to the max. value that restores the hit integrity (i.e. match the original Viterbi hit to the degree defined by -mac_min_length option). 2. MAC realignment algorithm also can find a local hit that is completely disjoint from the original Viterbi hit. This possibility follows from the fact that viterbi_matrix used in MAC DP algorithm is masked (see maskViterbiAlignment()) in a way that allows extension of the hit. This commit adds a check that the found MAC hit has intersection with Viterbi hit in both, query and target, coordinate spaces. If the check fails, the alternative MAC hit that satisfies this condition is sought (in decreasing score order). When no valid MAC hit is found, the original Viterbi hit is restored and an appropriate warning is printed. This error was also observed in the reported issue, see the hit #4 (Score=11.3) that was mapped erroneously to the second part of main hit, template columns [261; 357]. The alternative MAC hits' positions are collected in macAlgorithm() procedure and later used in backtraceMAC() procedure. --- src/hhalign.cpp | 5 + src/hhbacktracemac.cpp | 177 +++++++++++++++++++++++-------- src/hhblits.cpp | 5 + src/hhdecl.cpp | 1 + src/hhdecl.h | 1 + src/hhhit.h | 34 ++++++ src/hhmacalgorithm.cpp | 11 ++ src/hhposteriordecoder.cpp | 91 +++++++++++++++- src/hhposteriordecoder.h | 9 +- src/hhposteriordecoderrunner.cpp | 2 +- src/hhsearch.cpp | 5 + 11 files changed, 292 insertions(+), 49 deletions(-) diff --git a/src/hhalign.cpp b/src/hhalign.cpp index bf2b74e3..d1cb61a7 100755 --- a/src/hhalign.cpp +++ b/src/hhalign.cpp @@ -111,6 +111,8 @@ void HHalign::help(Parameters& par, char all) { printf(" -norealign do NOT realign displayed hits with MAC algorithm (def=realign) \n"); printf(" -mact [0,1[ posterior prob threshold for MAC realignment controlling greedi- \n"); printf(" ness at alignment ends: 0:global >0.1:local (default=%.2f) \n", par.mact); + printf(" -mac_min_length minimum length of MAC hit in comparison with the Viterbi hit\n"); + printf(" (in terms of matched_cols, default=%.2f)\n", par.mac_min_length); printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n"); if (all) { @@ -518,6 +520,9 @@ void HHalign::ProcessArguments(Parameters& par) { else if (!strcmp(argv[i], "-mact") && (i < argc - 1)) { par.mact = atof(argv[++i]); } + else if (!strcmp(argv[i], "-mac_min_length") + && (i < argc - 1)) + par.mac_min_length = atof(argv[++i]); //not in the help - but intended else if (!strcmp(argv[i], "-scwin") && (i < argc - 1)) { par.columnscore = 5; diff --git a/src/hhbacktracemac.cpp b/src/hhbacktracemac.cpp index 1fd6caca..f00ec9cc 100644 --- a/src/hhbacktracemac.cpp +++ b/src/hhbacktracemac.cpp @@ -108,7 +108,64 @@ void PosteriorDecoder::writeProfilesToHits(HMM &q, HMM &t, PosteriorMatrix &p_mm } } -void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, ViterbiMatrix & backtrace_matrix, const int elem, Hit & hit, float corr) { +void PosteriorDecoder::findBacktraceMAC(ViterbiMatrix & backtrace_matrix, const int elem, + int & i1, int & j1, int i2, int j2, std::vector & vi, std::vector & vj, std::vector & states, + int & matched_cols, int & nsteps, + LogLevel & actual_level) { + // Back-tracing loop + // In contrast to the Viterbi-Backtracing, STOP signifies the first Match-Match state, NOT the state before the first MM state + matched_cols = 1; // for each MACTH (or STOP) state matched_col is incremented by 1 + char state = ViterbiMatrix::MM; // lowest state with maximum score must be match-match state + int step = 0; // steps through the matrix correspond to alignment columns (from 1 to nsteps) + int i = i2; // last aligned pair is (i2,j2) + int j = j2; + + // arrays start from 1 + vi.push_back(0); + vj.push_back(0); + states.push_back((char) ViterbiMatrix::STOP); // note: static cast is needed for avoiding "undefined" ViterbiMatrix::MM object error + + if (backtrace_matrix.getMatMat(i, j, elem) != ViterbiMatrix::MM) { // b[i][j] != MM + if (Log::reporting_level() > DEBUG) + fprintf(stderr,"Error: backtrace does not start in match-match state, but in state %i, (i,j)=(%i,%i)\n",backtrace_matrix.getMatMat(i, j, elem),i,j); + + step = 0; + vi[0] = i; + vj[0] = j; + state = ViterbiMatrix::STOP; + } else { + while (state != ViterbiMatrix::STOP) { + step++; + state = backtrace_matrix.getMatMat(i, j, elem); // b[i][j]; + states.push_back(state); + vi.push_back(i); + vj.push_back(j); + + if (state == ViterbiMatrix::MM) matched_cols++; + + switch (state) { + case ViterbiMatrix::MM: i--; j--; break; + case ViterbiMatrix::IM: j--; break; + case ViterbiMatrix::MI: i--; break; + case ViterbiMatrix::STOP: break; + default: + fprintf(stderr,"Error: unallowed state value %i occurred during backtracing at step %i, (i,j)=(%i,%i)\n", state, step, i, j); + state = 0; + actual_level = DEBUG1; + break; + } //end switch (state) + } //end while (state) + } + // first state (STOP state) is set to MM state + states.push_back((char) ViterbiMatrix::MM); // note: static cast is needed for avoiding "undefined" ViterbiMatrix::MM object error + nsteps = step; + i1 = i2; + j1 = j2; + i1 = vi[nsteps]; + j1 = vj[nsteps]; +} + +void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, ViterbiMatrix & backtrace_matrix, const int elem, Hit & hit, float corr, float mac_min_length) { // Trace back trough the matrix b[i][j] until STOP state is found @@ -122,56 +179,88 @@ void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, Vi for (i = 0; i <= q.L; ++i) backtrace_matrix.setMatMat(i, 1, elem, ViterbiMatrix::STOP); // b[i][1] = STOP; for (j = 1; j <= t.L; ++j) backtrace_matrix.setMatMat(1, j, elem, ViterbiMatrix::STOP); // b[1][j] = STOP; - // Back-tracing loop - // In contrast to the Viterbi-Backtracing, STOP signifies the first Match-Match state, NOT the state before the first MM state - hit.matched_cols = 1; // for each MACTH (or STOP) state matched_col is incremented by 1 - hit.state = ViterbiMatrix::MM; // lowest state with maximum score must be match-match state - step = 0; // steps through the matrix correspond to alignment columns (from 1 to nsteps) - i = hit.i2; j = hit.j2; // last aligned pair is (i2,j2) - if (backtrace_matrix.getMatMat(i, j, elem) != ViterbiMatrix::MM) { // b[i][j] != MM - if (Log::reporting_level() > DEBUG) - fprintf(stderr,"Error: backtrace does not start in match-match state, but in state %i, (i,j)=(%i,%i)\n",backtrace_matrix.getMatMat(i, j, elem),i,j); + std::vector vi; + std::vector vj; + std::vector states; + int matched_cols, nsteps; + int i2 = hit.i2; + int j2 = hit.j2; + int i1, j1; + + findBacktraceMAC(backtrace_matrix, elem, i1, j1, i2, j2, vi, vj, states, matched_cols, nsteps, actual_level); + + char msg[512]; + if (i2 < m_temp_hit->i1 || j2 < m_temp_hit->j1 || i1 > m_temp_hit->i2 || j1 > m_temp_hit->j2) { + snprintf(msg, sizeof(msg), "The main MAC hit (%d,%d)-(%d,%d) is outside the Viterbi hit scope (%d,%d)-(%d,%d), looking for the alternative...\n", + i1, j1, i2, j2, m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2); + HH_LOG(WARNING) << msg; + // find first hit from mac_hit_end_positions which overlaps with Viterbi hit + int no = 1; + int found = 0; + int i2_0 = i2, j2_0 = j2; + for(std::multimap >::iterator it = hit.mac_hit_end_positions.begin(); + it != hit.mac_hit_end_positions.end(); it++, no++) { + double score = -it->first; + i2 = it->second.first; + j2 = it->second.second; + vi.clear(); + vj.clear(); + states.clear(); + findBacktraceMAC(backtrace_matrix, elem, i1, j1, i2, j2, vi, vj, states, matched_cols, nsteps, actual_level); + if (!(i2 < m_temp_hit->i1 || j2 < m_temp_hit->j1 || i1 > m_temp_hit->i2 || j1 > m_temp_hit->j2)) { + snprintf(msg, sizeof(msg), "Found the alternative MAC hit #%d (%d,%d)-(%d,%d) (score=%f) inside the Viterbi hit scope (%d,%d)-(%d,%d)\n", + no, i1, j1, i2, j2, score, m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2); + HH_LOG(WARNING) << msg; + found = no; + break; + } + } + if (found == 0) { + nsteps = 0; + } + } + int mac_is_valid = 1; + if (matched_cols < hit.matched_cols * mac_min_length) { + snprintf(msg, sizeof(msg), "MAC alignment is too short, matched_cols=%d, the threshold is %f of the original matched_cols=%d: restoring the original Viterbi hit (%d,%d)-(%d,%d). You can consider using lower -mact value in order to avoid such cases.\n", matched_cols, mac_min_length, hit.matched_cols, m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2); + HH_LOG(WARNING) << msg; + mac_is_valid = 0; + } + if (!nsteps) { + snprintf(msg, sizeof(msg), "Could not find valid MAC alternative hit for the original Viterbi hit scope: restoring the original Viterbi hit (%d,%d)-(%d,%d). You can consider using lower -mact value in order to avoid such cases.\n", m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2); + HH_LOG(WARNING) << msg; + mac_is_valid = 0; + } + if (!mac_is_valid) { + restoreHitPath(hit); + return; + } - step = 0; + // register the checked hit + for (step = 1; step <= nsteps; step++) { + hit.states[step] = hit.state = states[step]; + i = vi[step]; j = vj[step]; hit.i[step] = i; hit.j[step] = j; hit.alt_i->push_back(i); hit.alt_j->push_back(j); + // Exclude cells in direct neighbourhood from all further alignments + for (int ii = imax(i-2,1); ii <= imin(i+2, q.L); ++ii) + backtrace_matrix.setCellOff(ii, j, elem, true); + for (int jj = imax(j-2,1); jj <= imin(j+2, t.L); ++jj) + backtrace_matrix.setCellOff(i, jj, elem, true); + } + if (nsteps == 0) { + hit.i[0] = vi[0]; + hit.j[0] = vj[0]; + hit.alt_i->push_back(vi[0]); + hit.alt_j->push_back(vj[0]); hit.state = ViterbiMatrix::STOP; - } else { - while (hit.state != ViterbiMatrix::STOP) { - step++; - hit.states[step] = hit.state = backtrace_matrix.getMatMat(i, j, elem); // b[i][j]; - hit.i[step] = i; - hit.j[step] = j; - hit.alt_i->push_back(i); - hit.alt_j->push_back(j); - // Exclude cells in direct neighbourhood from all further alignments - for (int ii = imax(i-2,1); ii <= imin(i+2, q.L); ++ii) -// hit.cell_off[ii][j] = 1; - backtrace_matrix.setCellOff(ii, j, elem, true); - for (int jj = imax(j-2,1); jj <= imin(j+2, t.L); ++jj) - backtrace_matrix.setCellOff(i, jj, elem, true); - - if (hit.state == ViterbiMatrix::MM) hit.matched_cols++; - - switch (hit.state) { - case ViterbiMatrix::MM: i--; j--; break; - case ViterbiMatrix::IM: j--; break; - case ViterbiMatrix::MI: i--; break; - case ViterbiMatrix::STOP: break; - default: - fprintf(stderr,"Error: unallowed state value %i occurred during backtracing at step %i, (i,j)=(%i,%i)\n", hit.state, step, i, j); - hit.state = 0; - actual_level = DEBUG1; - break; - } //end switch (state) - } //end while (state) } - hit.i1 = hit.i[step]; - hit.j1 = hit.j[step]; - hit.states[step] = ViterbiMatrix::MM; // first state (STOP state) is set to MM state - hit.nsteps = step; + hit.i1 = hit.i[nsteps]; + hit.j1 = hit.j[nsteps]; + hit.states[nsteps] = ViterbiMatrix::MM; // first state (STOP state) is set to MM state + hit.nsteps = nsteps; + hit.matched_cols = matched_cols; // Allocate new space for alignment scores hit.S = new float[hit.nsteps+1]; diff --git a/src/hhblits.cpp b/src/hhblits.cpp index 1a20dac7..583218a5 100644 --- a/src/hhblits.cpp +++ b/src/hhblits.cpp @@ -292,6 +292,8 @@ void HHblits::help(Parameters& par, char all) { printf(" -realign_old_hits realign hits from previous iterations \n"); printf(" -mact [0,1[ posterior prob threshold for MAC realignment controlling greedi- \n"); printf(" ness at alignment ends: 0:global >0.1:local (default=%.2f) \n", par.mact); + printf(" -mac_min_length minimum length of MAC hit in comparison with the Viterbi hit\n"); + printf(" (in terms of matched_cols, default=%.2f)\n", par.mac_min_length); printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n"); if (all) { printf(" -realign realign displayed hits with max. accuracy (MAC) algorithm \n"); @@ -762,6 +764,9 @@ void HHblits::ProcessArguments(Parameters& par) { else if ((!strcmp(argv[i], "-mact") || !strcmp(argv[i], "-mapt")) && (i < argc - 1)) par.mact = atof(argv[++i]); + else if (!strcmp(argv[i], "-mac_min_length") + && (i < argc - 1)) + par.mac_min_length = atof(argv[++i]); else if (!strcmp(argv[i], "-sc") && (i < argc - 1)) par.columnscore = atoi(argv[++i]); //no help required diff --git a/src/hhdecl.cpp b/src/hhdecl.cpp index ef355046..8a5723d3 100755 --- a/src/hhdecl.cpp +++ b/src/hhdecl.cpp @@ -85,6 +85,7 @@ Parameters::Parameters(const int argc, const char** argv) : argc(argc), argv(arg ssa = 1.0f; // weight of ss evolution matrix shift = -0.03f; // Shift match score up mact = 0.3501f; // Probability threshold for MAC alignment in local mode for alignment ends (set to 0.3501 to track user modification) + mac_min_length = 0.9f; // Minimum length of MAC hit in comparison with the original Viterbi hit (in terms of matched_cols) corr = 0.1f; // Weight of correlations of scores for |i-j|<=4 egq = 0.0f; // no charge for end gaps as default diff --git a/src/hhdecl.h b/src/hhdecl.h index e9265b35..04854e1b 100644 --- a/src/hhdecl.h +++ b/src/hhdecl.h @@ -239,6 +239,7 @@ class Parameters // Parameters for gap penalties and pseudocounts float corr; // Weight of correlations between scores with |i-j|<=4 float shift; // Score offset for match-match states double mact; // Probability threshold (negative offset) in MAC alignment determining greediness at ends of alignment + float mac_min_length; // Minimum length of MAC hit in comparison with the original Viterbi hit (in terms of matched_cols) int realign_max; // Realign max ... hits float maxmem; // maximum available memory in GB for realignment (approximately) diff --git a/src/hhhit.h b/src/hhhit.h index 6c9d259e..9dc73045 100755 --- a/src/hhhit.h +++ b/src/hhhit.h @@ -12,6 +12,7 @@ #include #include #include +#include class Hit; @@ -138,6 +139,39 @@ class Hit char state; // 0: Start/STOP state 1: MM state 2: GD state (-D) 3: IM state 4: DG state (D-) 5 MI state int min_overlap; + /* macAlgorithm() method stores here all the possible MAC path end positions (it's needed by backtraceMAC()) */ + std::multimap > mac_hit_end_positions; + int max_mac_hit_end_positions; + + void initMACHitEndPositions(int max) { + if (mac_hit_end_positions.size() > 0) { + mac_hit_end_positions.erase(mac_hit_end_positions.begin(), mac_hit_end_positions.end()); + } + max_mac_hit_end_positions = max; + } + + void storeMACHitEndPosition(double score, int i2, int j2) { + std::pair p = std::pair(i2, j2); + mac_hit_end_positions.insert(std::pair >(-score, p)); + // store no more than max_mac_hit_end_positions for the best MAC hits + while (mac_hit_end_positions.size() > max_mac_hit_end_positions) { + mac_hit_end_positions.erase(std::next(mac_hit_end_positions.rbegin()).base()); + } + } + + void eraseMACHitEndPosition(double score, int i2, int j2) { + typedef std::multimap >::iterator mapIterator; + + std::pair its = mac_hit_end_positions.equal_range(-score); + std::pair p = std::pair(i2, j2); + for(mapIterator it = its.first; it != its.second; it++) { + if (it->second == p) { + mac_hit_end_positions.erase(it); + return; + } + } + } + private: // Calculate probability of true positive : p_TP(score)/( p_TP(score)+p_FP(score) ) // TP: same superfamily OR MAXSUB score >=0.1 diff --git a/src/hhmacalgorithm.cpp b/src/hhmacalgorithm.cpp index f053d1b9..3f7feb9a 100644 --- a/src/hhmacalgorithm.cpp +++ b/src/hhmacalgorithm.cpp @@ -50,6 +50,7 @@ void PosteriorDecoder::macAlgorithm(HMM & q, HMM & t, Hit & hit, // char * c_ptr = new char; char val; hit.min_overlap = 0; + hit.initMACHitEndPositions(imax(q.L, t.L)); // reasonable limit on the number of alternative MAC hits // Dynamic programming for (i = 1; i <= q.L; ++i) { // Loop through query positions i // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues @@ -128,6 +129,13 @@ void PosteriorDecoder::macAlgorithm(HMM & q, HMM & t, Hit & hit, hit.i2 = i; hit.j2 = j; score_MAC = S_curr[j]; } + if((m_local || i == q.L) && val == ViterbiMatrix::MM) { // local alignment path ends with MM state + hit.storeMACHitEndPosition(S_curr[j], i, j); + if (val == ViterbiMatrix::MM) { + // delete suboptimal previous position + hit.eraseMACHitEndPosition(S_prev[j-1], i - 1, j - 1); + } + } } // end if @@ -145,6 +153,9 @@ void PosteriorDecoder::macAlgorithm(HMM & q, HMM & t, Hit & hit, hit.j2 = jmax; score_MAC = S_curr[jmax]; } + if (!m_local) { + hit.storeMACHitEndPosition(S_curr[jmax], i, jmax); + } for (j = 0; j <= t.L; ++j) S_prev[j] = S_curr[j]; diff --git a/src/hhposteriordecoder.cpp b/src/hhposteriordecoder.cpp index ce37ff84..9fed0cb4 100644 --- a/src/hhposteriordecoder.cpp +++ b/src/hhposteriordecoder.cpp @@ -86,7 +86,7 @@ PosteriorDecoder::~PosteriorDecoder() { void PosteriorDecoder::realign(HMM &q, HMM &t, Hit &hit, PosteriorMatrix &p_mm, ViterbiMatrix &viterbi_matrix, std::vector alignment_to_exclude, - char * exclstr, char* template_exclstr, int par_min_overlap, float shift, float mact, float corr) { + char * exclstr, char* template_exclstr, int par_min_overlap, float shift, float mact, float corr, float mac_min_length) { HMM & curr_q_hmm = q; HMM & curr_t_hmm = t; @@ -112,7 +112,7 @@ void PosteriorDecoder::realign(HMM &q, HMM &t, Hit &hit, backwardAlgorithm(curr_q_hmm, curr_t_hmm, hit, p_mm, viterbi_matrix, shift, 0); macAlgorithm(curr_q_hmm, curr_t_hmm, hit, p_mm, viterbi_matrix, mact, 0); - backtraceMAC(curr_q_hmm, curr_t_hmm, p_mm, viterbi_matrix, 0, hit, corr); + backtraceMAC(curr_q_hmm, curr_t_hmm, p_mm, viterbi_matrix, 0, hit, corr, mac_min_length); restoreHitValues(hit); writeProfilesToHits(curr_q_hmm, curr_t_hmm, p_mm, viterbi_matrix, hit); // add result to exclution paths (needed to align 2nd, 3rd, ... best alignment) @@ -279,6 +279,41 @@ void PosteriorDecoder::memorizeHitValues(Hit &curr_hit) { m_temp_hit->Eval = curr_hit.Eval; m_temp_hit->logEval = curr_hit.logEval; m_temp_hit->Probab = curr_hit.Probab; + m_temp_hit->i1 = curr_hit.i1; + m_temp_hit->i2 = curr_hit.i2; + m_temp_hit->j1 = curr_hit.j1; + m_temp_hit->j2 = curr_hit.j2; + m_temp_hit->nsteps = curr_hit.nsteps; + if (curr_hit.i) { + m_temp_hit->i = new int[curr_hit.nsteps + 1]; + for(int step = 1; step <= curr_hit.nsteps; step++) { + m_temp_hit->i[step] = curr_hit.i[step]; + } + } + if (curr_hit.j) { + m_temp_hit->j = new int[curr_hit.nsteps + 1]; + for(int step = 1; step <= curr_hit.nsteps; step++) { + m_temp_hit->j[step] = curr_hit.j[step]; + } + } + if (curr_hit.S) { + m_temp_hit->S = new float[curr_hit.nsteps + 1]; + for(int step = 1; step <= curr_hit.nsteps; step++) { + m_temp_hit->S[step] = curr_hit.S[step]; + } + } + if (curr_hit.S_ss) { + m_temp_hit->S_ss = new float[curr_hit.nsteps + 1]; + for(int step = 1; step <= curr_hit.nsteps; step++) { + m_temp_hit->S_ss[step] = curr_hit.S_ss[step]; + } + } + if (curr_hit.states) { + m_temp_hit->states = new char[curr_hit.nsteps + 1]; + for(int step = 1; step <= curr_hit.nsteps; step++) { + m_temp_hit->states[step] = curr_hit.states[step]; + } + } } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -296,6 +331,58 @@ void PosteriorDecoder::restoreHitValues(Hit &curr_hit) { curr_hit.Eval = m_temp_hit->Eval; curr_hit.logEval = m_temp_hit->logEval; curr_hit.Probab = m_temp_hit->Probab; + if (m_temp_hit->i) { + delete[] m_temp_hit->i; + } + if (m_temp_hit->j) { + delete[] m_temp_hit->j; + } + if (m_temp_hit->S) { + delete[] m_temp_hit->S; + } + if (m_temp_hit->S_ss) { + delete[] m_temp_hit->S_ss; + } + if (m_temp_hit->states) { + delete[] m_temp_hit->states; + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Restore the current hit with Viterbi alignment path +/////////////////////////////////////////////////////////////////////////////////////////////////// +void PosteriorDecoder::restoreHitPath(Hit &curr_hit) { + curr_hit.nsteps = m_temp_hit->nsteps; + if (curr_hit.S) { + delete[] curr_hit.S; + } + curr_hit.S = new float[curr_hit.nsteps + 1]; + if (curr_hit.S_ss) { + delete[] curr_hit.S_ss; + curr_hit.S_ss = NULL; + } + if (m_temp_hit->S_ss) { + curr_hit.S_ss = new float[curr_hit.nsteps + 1]; + } + if (curr_hit.states) { + delete[] curr_hit.states; + } + curr_hit.states = new char[curr_hit.nsteps + 1]; + for(int step = 1; step <= curr_hit.nsteps; step++) { + curr_hit.i[step] = m_temp_hit->i[step]; + curr_hit.j[step] = m_temp_hit->j[step]; + curr_hit.S[step] = m_temp_hit->S[step]; + if (m_temp_hit->S_ss) { + curr_hit.S_ss[step] = m_temp_hit->S_ss[step]; + } + curr_hit.states[step] = m_temp_hit->states[step]; + curr_hit.alt_i->push_back(curr_hit.i[step]); + curr_hit.alt_j->push_back(curr_hit.j[step]); + } + curr_hit.i1 = curr_hit.i[curr_hit.nsteps]; + curr_hit.j1 = curr_hit.j[curr_hit.nsteps]; + curr_hit.i2 = curr_hit.i[1]; + curr_hit.j2 = curr_hit.j[1]; } void PosteriorDecoder::printVector(float * vec) { diff --git a/src/hhposteriordecoder.h b/src/hhposteriordecoder.h index 737ce017..dfbd74fd 100644 --- a/src/hhposteriordecoder.h +++ b/src/hhposteriordecoder.h @@ -66,7 +66,7 @@ class PosteriorDecoder { ///////////////////////////////////////////////////////////////////////////////////// void realign(HMM &q, HMM &t, Hit &hit, PosteriorMatrix &p_mm, ViterbiMatrix &viterbi_matrix, std::vector alignment_to_exclude, char * exclstr, - char* template_exclstr, int par_min_overlap, float shift, float mact, float corr); + char* template_exclstr, int par_min_overlap, float shift, float mact, float corr, float mac_min_length); void excludeMACAlignment(const int q_length, const int t_length, ViterbiMatrix &celloff_matrix, const int elem, MACBacktraceResult & alignment); @@ -125,7 +125,11 @@ class PosteriorDecoder { ViterbiMatrix & viterbi_matrix, float shift, const int elem); void macAlgorithm(HMM & q_hmm, HMM & t_hmm, Hit & hit_vec, PosteriorMatrix & p_mm, ViterbiMatrix & viterbi_matrix, float par_mact, const int elem); - void backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, ViterbiMatrix & backtrace_matrix, const int elem, Hit & hit, float corr); + void findBacktraceMAC(ViterbiMatrix & backtrace_matrix, const int elem, + int & i1, int & j1, int i2, int j2, std::vector & vi, std::vector & vj, std::vector & states, + int & matched_cols, int & nsteps, + LogLevel & actual_level); + void backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, ViterbiMatrix & backtrace_matrix, const int elem, Hit & hit, float corr, float mac_min_length); void writeProfilesToHits(HMM &q, HMM &t, PosteriorMatrix &p_mm, ViterbiMatrix & backtrace_matrix, Hit &hit); void initializeBacktrace(HMM & q, HMM & t, Hit & hit); @@ -134,6 +138,7 @@ class PosteriorDecoder { const int elem, Hit const &hit) const; void memorizeHitValues(Hit & curr_hit); void restoreHitValues(Hit &curr_hit); + void restoreHitPath(Hit &curr_hit); void printVector(simd_float * vec); void printVector(simd_int * vec); diff --git a/src/hhposteriordecoderrunner.cpp b/src/hhposteriordecoderrunner.cpp index 0baeda10..4ee59b59 100644 --- a/src/hhposteriordecoderrunner.cpp +++ b/src/hhposteriordecoderrunner.cpp @@ -97,7 +97,7 @@ void PosteriorDecoderRunner::executeComputation(HMM &q, std::vector hits // start realignment process decoder->realign(*q_hmm, *t_hmm[current_thread_id], *hit_cur, *m_posterior_matrices[current_thread_id], - *m_backtrace_matrix[current_thread_id], alignment_to_exclude, par.exclstr, par.template_exclstr, par.min_overlap, par.shift, par.mact, par.corr); + *m_backtrace_matrix[current_thread_id], alignment_to_exclude, par.exclstr, par.template_exclstr, par.min_overlap, par.shift, par.mact, par.corr, par.mac_min_length); // add result to exclution paths (needed to align 2nd, 3rd, ... best alignment) alignment_to_exclude.push_back(PosteriorDecoder::MACBacktraceResult(hit_cur->alt_i, hit_cur->alt_j)); } // end idb diff --git a/src/hhsearch.cpp b/src/hhsearch.cpp index 8adfe901..6c025e9c 100755 --- a/src/hhsearch.cpp +++ b/src/hhsearch.cpp @@ -130,6 +130,8 @@ void HHsearch::help(Parameters& par, char all) { printf(" -ovlp banded alignment: forbid largest diagonals |i-j| of DP matrix (def=%i)\n", par.min_overlap); printf(" -mact [0,1[ posterior prob threshold for MAC realignment controlling greedi- \n"); printf(" ness at alignment ends: 0:global >0.1:local (default=%.2f) \n", par.mact); + printf(" -mac_min_length minimum length of MAC hit in comparison with the Viterbi hit\n"); + printf(" (in terms of matched_cols, default=%.2f)\n", par.mac_min_length); printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n"); if (all) { printf(" -realign realign displayed hits with max. accuracy (MAC) algorithm \n"); @@ -486,6 +488,9 @@ void HHsearch::ProcessArguments(Parameters& par) { else if ((!strcmp(argv[i], "-mact")) && (i < argc - 1)) par.mact = atof(argv[++i]); + else if (!strcmp(argv[i], "-mac_min_length") + && (i < argc - 1)) + par.mac_min_length = atof(argv[++i]); else if (!strcmp(argv[i], "-sc") && (i < argc - 1)) par.columnscore = atoi(argv[++i]); //no help required From 9229bfea30a656db93013e2d300e5528aff0d434 Mon Sep 17 00:00:00 2001 From: Dmitriy Samborskiy Date: Mon, 15 Jul 2019 01:12:47 +0300 Subject: [PATCH 3/5] hhdecl.cpp - default mac_min_length value = 0.0 The default for this option was changed from 0.9 to 0.0. When MAC is enabled user could agree that the hit will be shorter than the Viterbi hit, especially when high -mact values are used. Therefore discarding shorter hits by default would not be the best strategy (Milot's suggestion). --- src/hhdecl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hhdecl.cpp b/src/hhdecl.cpp index 8a5723d3..32b42f6c 100755 --- a/src/hhdecl.cpp +++ b/src/hhdecl.cpp @@ -85,7 +85,7 @@ Parameters::Parameters(const int argc, const char** argv) : argc(argc), argv(arg ssa = 1.0f; // weight of ss evolution matrix shift = -0.03f; // Shift match score up mact = 0.3501f; // Probability threshold for MAC alignment in local mode for alignment ends (set to 0.3501 to track user modification) - mac_min_length = 0.9f; // Minimum length of MAC hit in comparison with the original Viterbi hit (in terms of matched_cols) + mac_min_length = 0.0f; // Minimum length of MAC hit in comparison with the original Viterbi hit (in terms of matched_cols) corr = 0.1f; // Weight of correlations of scores for |i-j|<=4 egq = 0.0f; // no charge for end gaps as default From 1b12647442432df41efc7e03619910aed3979e1d Mon Sep 17 00:00:00 2001 From: Dmitriy Samborskiy Date: Mon, 15 Jul 2019 01:24:09 +0300 Subject: [PATCH 4/5] backtraceMAC - add warning when MAC hit is shorter than the Viterbi hit in terms of number of matched columns. This warning is issued when mac_min_length threshold is not triggered, i.e. when MAC hit is accepted despite of being shorter than the Viterbi hit. --- src/hhbacktracemac.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/hhbacktracemac.cpp b/src/hhbacktracemac.cpp index f00ec9cc..d06ef8dc 100644 --- a/src/hhbacktracemac.cpp +++ b/src/hhbacktracemac.cpp @@ -224,6 +224,9 @@ void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, Vi snprintf(msg, sizeof(msg), "MAC alignment is too short, matched_cols=%d, the threshold is %f of the original matched_cols=%d: restoring the original Viterbi hit (%d,%d)-(%d,%d). You can consider using lower -mact value in order to avoid such cases.\n", matched_cols, mac_min_length, hit.matched_cols, m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2); HH_LOG(WARNING) << msg; mac_is_valid = 0; + } else if (matched_cols < hit.matched_cols) { + snprintf(msg, sizeof(msg), "MAC alignment is shorter (matched_cols=%d) than the Viterbi hit (matched_cols=%d). You can consider using lower -mact value in order to avoid such cases.\n", matched_cols, hit.matched_cols); + HH_LOG(WARNING) << msg; } if (!nsteps) { snprintf(msg, sizeof(msg), "Could not find valid MAC alternative hit for the original Viterbi hit scope: restoring the original Viterbi hit (%d,%d)-(%d,%d). You can consider using lower -mact value in order to avoid such cases.\n", m_temp_hit->i1, m_temp_hit->j1, m_temp_hit->i2, m_temp_hit->j2); From 03235849591457810dea47ee18573e511bf849b1 Mon Sep 17 00:00:00 2001 From: Dmitriy Samborskiy Date: Mon, 15 Jul 2019 01:54:01 +0300 Subject: [PATCH 5/5] backtraceMAC - improve the warning about MAC hit if it's shorter than the Viterbi hit (i.e. when -mac_min_length threshold is not triggered). Suggest not only lowering -mact value but also using -mac_min_length threshold. --- src/hhbacktracemac.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hhbacktracemac.cpp b/src/hhbacktracemac.cpp index d06ef8dc..2907d7cf 100644 --- a/src/hhbacktracemac.cpp +++ b/src/hhbacktracemac.cpp @@ -225,7 +225,7 @@ void PosteriorDecoder::backtraceMAC(HMM & q, HMM & t, PosteriorMatrix & p_mm, Vi HH_LOG(WARNING) << msg; mac_is_valid = 0; } else if (matched_cols < hit.matched_cols) { - snprintf(msg, sizeof(msg), "MAC alignment is shorter (matched_cols=%d) than the Viterbi hit (matched_cols=%d). You can consider using lower -mact value in order to avoid such cases.\n", matched_cols, hit.matched_cols); + snprintf(msg, sizeof(msg), "MAC alignment is shorter (matched_cols=%d) than the Viterbi hit (matched_cols=%d). You can consider using lower -mact value or higher -mac_min_length value in order to avoid such cases.\n", matched_cols, hit.matched_cols); HH_LOG(WARNING) << msg; } if (!nsteps) {