From 60e336df18a26873e5a13ec3aeae04a1b6e7cc29 Mon Sep 17 00:00:00 2001
From: Ruoshi Zhang <rosyzhang9701@gmail.com>
Date: Mon, 11 May 2020 20:49:55 +0200
Subject: [PATCH 1/5] Update combine pval agg-mode 3

---
 src/multihit/combinepvalperset.cpp | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/src/multihit/combinepvalperset.cpp b/src/multihit/combinepvalperset.cpp
index 2c91f9a2d..30f0ee84d 100644
--- a/src/multihit/combinepvalperset.cpp
+++ b/src/multihit/combinepvalperset.cpp
@@ -166,21 +166,41 @@ class PvalueAggregator : public Aggregation {
             updatedPval = exp(sumLogPval);   
         }
 
-        //3) the P-values of the truncated product method 
+        //3) the P-values of the (modified) truncated product method 
         else if(aggregationMode == Parameters::AGGREGATION_MODE_TRUNCATED_PRODUCT){
+            //new theory: taking the best hit regardless of threshold and (from second hit on)sum of how much it surpassed threshold
             unsigned int orfCount = Util::fast_atoi<unsigned int>(querySizeReader->getDataByDBKey(querySetKey, thread_idx));
             double logPvalThreshold = log(alpha / (orfCount + 1));
-            double sumLogPval = 0;
+            double minLogPval = 0;
+            double sumLogPval = 0; 
+            size_t k = 0;
             for (size_t i = 0; i < dataToAggregate.size(); ++i) {
                 double logPvalue = std::strtod(dataToAggregate[i][1].c_str(), NULL);
+                if (logPvalue < minLogPval) {
+                    if (logPvalue == 0) {
+                        //to avoid -0.0
+                        minLogPval = logPvalue;
+                    }
+                    else {minLogPval = -logPvalue;}
+                }
                 if (logPvalue < logPvalThreshold) {
-                    sumLogPval += logPvalue;
+                    //sum up the part exceeding logThreshold, add a minus to make score positive
+                    sumLogPval -= logPvalue - logPvalThreshold;
+                    k++;
                 }
             }
-            updatedPval = exp(sumLogPval); 
+            if(k == 0){
+                //if no hit passed thr, take the -log of best hit pval as score
+                buffer.append(SSTR(minLogPval));
+                return buffer;
+            }
+            else {
+                //if one or more hits passed thr
+                buffer.append(SSTR(sumLogPval - logPvalThreshold));
+                return buffer;
+            }
         }
 
-
         
         else {
             Debug(Debug::ERROR) << "Invalid aggregation function!\n";

From aea233a94a9ba4a7f15fe3c99fa5147cf3f78bd5 Mon Sep 17 00:00:00 2001
From: Ruoshi Zhang <rosyzhang9701@gmail.com>
Date: Fri, 2 Jul 2021 16:10:32 +0200
Subject: [PATCH 2/5] Improve gff2db to parse multiple files, and try
 multi-threading

---
 src/MMseqsBase.cpp         |   4 +-
 src/commons/Parameters.cpp |   1 +
 src/util/gff2db.cpp        | 231 ++++++++++++++++++++++---------------
 3 files changed, 142 insertions(+), 94 deletions(-)

diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index 350e0eecf..7ac1f4c8a 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -1146,8 +1146,8 @@ std::vector<Command> baseCommands = {
                 "Extract regions from a sequence database based on a GFF3 file",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:gff3File> <i:sequenceDB> <o:sequenceDB>",
-                CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                "<i:gff3File1> ... <i:gff3FileN> <i:sequenceDB> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile },
                                                            {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
         {"maskbygff",            maskbygff,            &par.gff2db,               COMMAND_SPECIAL,
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index 3b3763530..80c7faa0c 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -749,6 +749,7 @@ Parameters::Parameters():
     // gff2db
     gff2db.push_back(&PARAM_GFF_TYPE);
     gff2db.push_back(&PARAM_ID_OFFSET);
+    gff2db.push_back(&PARAM_THREADS);
     gff2db.push_back(&PARAM_V);
 
 
diff --git a/src/util/gff2db.cpp b/src/util/gff2db.cpp
index 2776c48df..de0864256 100644
--- a/src/util/gff2db.cpp
+++ b/src/util/gff2db.cpp
@@ -4,118 +4,165 @@
 #include "Util.h"
 #include "MemoryMapped.h"
 #include "Orf.h"
+#include "Parameters.h"
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
 
 int gff2db(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
-    par.parseParameters(argc, argv, command, true, 0, 0);
+    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+
+    std::vector<std::string> filenames(par.filenames);
+    std::string outDb = par.filenames.back();
+    par.filenames.pop_back();
+    std::string seqDb = par.filenames.back();
+    par.filenames.pop_back();
 
-    MemoryMapped file(par.db1, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
-    if (!file.isValid()) {
-        Debug(Debug::ERROR) << "Could not open GFF file " << par.db1 << "\n";
-        EXIT(EXIT_FAILURE);
-    }
-    char *data = (char *) file.getData();
 
-    DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), 1, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_LOOKUP_REV);
+    DBReader<unsigned int> reader(seqDb.c_str(), (seqDb + ".index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_LOOKUP_REV);
     reader.open(DBReader<unsigned int>::NOSORT);
-    DBReader<unsigned int> headerReader(par.hdr2.c_str(), par.hdr2Index.c_str(), 1, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    DBReader<unsigned int> headerReader((seqDb + "_h").c_str(), (seqDb + "_h.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
     headerReader.open(DBReader<unsigned int>::NOSORT);
 
-    DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_NUCLEOTIDES);
+    DBWriter writer(outDb.c_str(), (outDb + ".index").c_str(), par.threads, par.compressed, Parameters::DBTYPE_NUCLEOTIDES);
     writer.open();
-    DBWriter headerWriter(par.hdr3.c_str(), par.hdr3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_GENERIC_DB);
+    DBWriter headerWriter((outDb + "_h").c_str(), (outDb + "_h.index").c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
     headerWriter.open();
 
-    bool shouldCompareType = par.gffType.length() > 0;
-
-    Debug::Progress progress;
-    unsigned int entries_num = 0;
-    char buffer[1024];
-    const char* fields[255];
-
-    std::string revStr;
-    revStr.reserve(par.maxSeqLen + 1);
+    FILE *lookup = FileUtil::openAndDelete((outDb + ".lookup").c_str(), "w");
+    FILE *source = FileUtil::openAndDelete((outDb + ".source").c_str(), "w");
 
-    while (*data != '\0') {
-        progress.updateProgress();
-
-        // line is a comment
-        if (*data == '#') {
-            data = Util::skipLine(data);
-            continue;
-        }
+    bool shouldCompareType = par.gffType.length() > 0;
 
-        const size_t columns = Util::getWordsOfLine(data, fields, 255);
-        data = Util::skipLine(data);
-        if (columns < 9) {
-            Debug(Debug::WARNING) << "Not enough columns in GFF file\n";
-            continue;
-        }
+    if (par.filenames.size() < reader.getSize()) {
+        Debug(Debug::WARNING) << "Not enough GFF files are provided. Some results might be omitted. \n";
+        //EXIT(EXIT_FAILURE);
+    }
 
-        if (shouldCompareType) {
-            std::string type(fields[2], fields[3] - fields[2] - 1);
-            if (type.compare(par.gffType) != 0) {
-                continue;
+    Debug::Progress progress(par.filenames.size());
+#pragma omp parallel
+    {
+        int thread_idx = 0;
+        
+#ifdef OPENMP
+        thread_idx = omp_get_thread_num();
+#endif
+
+        unsigned int entries_num = 0;
+        char buffer[1024];
+        const char* fields[255];
+
+        std::string filename;
+        filename.reserve(100);
+
+        std::string revStr;
+        revStr.reserve(par.maxSeqLen + 1);
+
+#pragma omp for schedule(dynamic, 10)
+        for (size_t i = 0; i < par.filenames.size(); ++i) {
+            progress.updateProgress();
+            filename = par.filenames[i];
+            MemoryMapped file(filename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
+            if (!file.isValid()) {
+                Debug(Debug::ERROR) << "Could not open GFF file " << filename << "\n";
+                EXIT(EXIT_FAILURE);
             }
-        }
-
-        size_t start = Util::fast_atoi<size_t>(fields[3]);
-        size_t end = Util::fast_atoi<size_t>(fields[4]);
-        if (start == end) {
-            Debug(Debug::WARNING) << "Invalid sequence length in line " << entries_num << "!\n";
-            continue;
-        }
-
-        std::string name(fields[0], fields[1] - fields[0] - 1);
-        size_t lookupId = reader.getLookupIdByAccession(name);
-        if (lookupId == SIZE_MAX) {
-            Debug(Debug::ERROR) << "GFF entry not found in database lookup: " << name << "!\n";
-            return EXIT_FAILURE;
-        }
-        unsigned int key = reader.getLookupKey(lookupId);
-
-        size_t headerId = headerReader.getId(key);
-        if (headerId == UINT_MAX) {
-            Debug(Debug::ERROR) << "GFF entry not found in header database: " << name << "!\n";
-            return EXIT_FAILURE;
-        }
-        unsigned int id = par.identifierOffset + entries_num;
-
-        headerWriter.writeStart(0);
-        headerWriter.writeAdd(headerReader.getData(headerId, 0), std::max(headerReader.getSeqLen(headerId), (size_t)2) - 2, 0);
-        int len = snprintf(buffer, 1024, " %zu-%zu\n", start, end);
-        headerWriter.writeAdd(buffer, len, 0);
-        headerWriter.writeEnd(id, 0);
-
-        size_t seqId = reader.getId(key);
-        if (seqId == UINT_MAX) {
-            Debug(Debug::ERROR) << "GFF entry not found in sequence database: " << name << "!\n";
-            return EXIT_FAILURE;
-        }
-
-        ssize_t length = end - start;
-        char *seq = reader.getData(seqId, 0);
-
-        writer.writeStart(0);
-        if (length > 0) {
-            writer.writeAdd(seq + start, length + 1, 0);
-        } else {
-            for (size_t i = start; i >= start + length; i--) {
-                revStr.append(1, Orf::complement(seq[i]));
+            char *data = (char *) file.getData();
+            size_t idx = 0;
+            fprintf(source, "%zu\t%s\n", i, filename.c_str());
+            while (*data != '\0') {
+                // line is a comment
+                if (*data == '#') {
+                    data = Util::skipLine(data);
+                    continue;
+                }
+                //TODO: multi-word column 2 could exist
+                const size_t columns = Util::getWordsOfLine(data, fields, 255);
+                data = Util::skipLine(data);
+                if (columns < 9) {
+                    Debug(Debug::WARNING) << "Not enough columns in GFF file\n";
+                    continue;
+                }
+
+                if (shouldCompareType) {
+                    std::string type(fields[2], fields[3] - fields[2] - 1);
+                    if (type.compare(par.gffType) != 0) {
+                        continue;
+                    }
+                }
+
+                size_t start = Util::fast_atoi<size_t>(fields[3]);
+                size_t end = Util::fast_atoi<size_t>(fields[4]);
+                if (start == end) {
+                    Debug(Debug::WARNING) << "Invalid sequence length in line " << entries_num << "!\n";
+                    continue;
+                }
+                std::string strand(fields[6], fields[7] - fields[6] - 1);
+                std::string name(fields[0], fields[1] - fields[0] - 1);
+                size_t lookupId = reader.getLookupIdByAccession(name);
+                if (lookupId == SIZE_MAX) {
+                    Debug(Debug::ERROR) << "GFF entry not found in database lookup: " << name << "!\n";
+                    EXIT(EXIT_FAILURE);
+                }
+                unsigned int key = reader.getLookupKey(lookupId);
+
+                size_t headerId = headerReader.getId(key);
+                if (headerId == UINT_MAX) {
+                    Debug(Debug::ERROR) << "GFF entry not found in header database: " << name << "!\n";
+                    EXIT(EXIT_FAILURE);
+                }
+                unsigned int id = par.identifierOffset + entries_num;
+
+                idx++;
+                if (strand == "+") {
+                    Orf::writeOrfHeader(buffer, key, start, end, 0, 0);
+                } else {
+                    Orf::writeOrfHeader(buffer, key, end, start, 0, 0);
+                }
+                headerWriter.writeData(buffer, strlen(buffer), key, thread_idx);
+                
+
+                size_t seqId = reader.getId(key);
+                if (seqId == UINT_MAX) {
+                    Debug(Debug::ERROR) << "GFF entry not found in sequence database: " << name << "!\n";
+                    EXIT(EXIT_FAILURE);
+                }
+
+                //check the strand instead of end - start 
+                ssize_t length = end - start + 1;
+                char *seq = reader.getData(seqId, thread_idx);
+
+                writer.writeStart(thread_idx);
+                if (strand == "+") {
+                    fprintf(lookup, "%u\t%s_%zu_%zu_%zu\t%zu\n", id, name.c_str(), idx, start, end, i);
+                    writer.writeAdd(seq + start - 1 , length, thread_idx);
+                } else {
+                    fprintf(lookup, "%u\t%s_%zu_%zu_%zu\t%zu\n", id, name.c_str(), idx, end, start, i);
+                    for (size_t i = end - 1 ; i >= end - length; i--) {
+                        revStr.append(1, Orf::complement(seq[i]));
+                    }
+                if (revStr.size() % 3 != 0) {
+                    Debug(Debug::WARNING) << "The nucleotide sequence length is not divisible by 3!\n";
+                }  
+                    writer.writeAdd(revStr.c_str(), revStr.size(), thread_idx);
+                    revStr.clear();
+                }
+                writer.writeAdd("\n", 1, thread_idx);
+                writer.writeEnd(id, thread_idx);
+
+                entries_num++;
             }
-            writer.writeAdd(revStr.c_str(), revStr.size(), 0);
-            revStr.clear();
+            file.close();
         }
-        writer.writeAdd("\n", 1, 0);
-        writer.writeEnd(id, 0);
-
-        entries_num++;
     }
-    headerWriter.close();
-    writer.close();
+    fclose(lookup);
+    fclose(source);
+    headerWriter.close(true);
+    writer.close(true);
     headerReader.close();
     reader.close();
-    file.close();
 
     return EXIT_SUCCESS;
 }

From 5e9ee7419ef91b7450c7eef571973debd76226af Mon Sep 17 00:00:00 2001
From: Ruoshi Zhang <rosyzhang9701@gmail.com>
Date: Thu, 21 Oct 2021 15:09:43 +0200
Subject: [PATCH 3/5] createRenumberedDB can also keep entry name

---
 src/commons/DBWriter.cpp | 6 ++++--
 src/commons/DBWriter.h   | 2 +-
 src/util/gff2db.cpp      | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp
index 9ae84bae1..eae05b717 100644
--- a/src/commons/DBWriter.cpp
+++ b/src/commons/DBWriter.cpp
@@ -678,7 +678,7 @@ void DBWriter::writeThreadBuffer(unsigned int idx, size_t dataSize) {
     }
 }
 
-void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, int sortMode) {
+void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, bool keepEntryName, int sortMode) {
     DBReader<unsigned int>* lookupReader = NULL;
     FILE *sLookup = NULL;
     if (origData.empty() == false && origIndex.empty() == false) {
@@ -711,7 +711,9 @@ void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string
             size_t lookupId = lookupReader->getLookupIdByKey(idx->id);
             DBReader<unsigned int>::LookupEntry copy = lookup[lookupId];
             copy.id = i;
-            copy.entryName = SSTR(idx->id);
+            if(!keepEntryName){
+                copy.entryName = SSTR(idx->id);
+            }
             lookupReader->lookupEntryToBuffer(strBuffer, copy);
             written = fwrite(strBuffer.c_str(), sizeof(char), strBuffer.size(), sLookup);
             if (written != (int) strBuffer.size()) {
diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h
index 9d86374d7..409a509c5 100644
--- a/src/commons/DBWriter.h
+++ b/src/commons/DBWriter.h
@@ -59,7 +59,7 @@ class DBWriter : public MemoryTracker  {
     template <typename T>
     static void writeIndexEntryToFile(FILE *outFile, char *buff1, T &index);
 
-    static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, int sortMode = DBReader<unsigned int>::SORT_BY_ID_OFFSET);
+    static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, bool keepEntryName = false, int sortMode = DBReader<unsigned int>::SORT_BY_ID_OFFSET);
 
     bool isClosed(){
         return closed;
diff --git a/src/util/gff2db.cpp b/src/util/gff2db.cpp
index 92daa0bba..001c62801 100644
--- a/src/util/gff2db.cpp
+++ b/src/util/gff2db.cpp
@@ -33,7 +33,7 @@ int gff2db(int argc, const char **argv, const Command &command) {
     headerWriter.open();
     std::string outLookup = outDb + ".lookup";
     std::string outLookupIndex = outDb + ".lookup.index";
-    DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, 0, Parameters::DBTYPE_OMIT_FILE);
+    DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_OMIT_FILE);
     lookupWriter.open();
 
     FILE *source = FileUtil::openAndDelete((outDb + ".source").c_str(), "w");
@@ -198,7 +198,7 @@ int gff2db(int argc, const char **argv, const Command &command) {
 
 #pragma omp task
                 {
-                    DBWriter::createRenumberedDB(outDb, outDbIndex, outDb, outDbIndex);
+                    DBWriter::createRenumberedDB(outDb, outDbIndex, outDb, outDbIndex, true);
                 }
             }
         }

From fcae85abc88cd88a692b5d37ed69fe568e960a49 Mon Sep 17 00:00:00 2001
From: Ruoshi Zhang <rosyzhang9701@gmail.com>
Date: Tue, 11 Jan 2022 14:43:26 +0100
Subject: [PATCH 4/5] Fix seg fault when memorymapped handles files that end
 exactly at the end of page

---
 src/util/convertalignments.cpp | 6 ++++--
 src/util/gff2db.cpp            | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/util/convertalignments.cpp b/src/util/convertalignments.cpp
index 87633f1ed..1cc59a131 100644
--- a/src/util/convertalignments.cpp
+++ b/src/util/convertalignments.cpp
@@ -101,8 +101,9 @@ std::map<unsigned int, unsigned int> readKeyToSet(const std::string& file) {
 
     MemoryMapped lookup(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
     char* data = (char *) lookup.getData();
+    char* end = data + lookup.mappedSize();
     const char* entry[255];
-    while (*data != '\0') {
+    while (data < end && *data != '\0') {
         const size_t columns = Util::getWordsOfLine(data, entry, 255);
         if (columns < 3) {
             Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n";
@@ -124,8 +125,9 @@ std::map<unsigned int, std::string> readSetToSource(const std::string& file) {
 
     MemoryMapped source(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
     char* data = (char *) source.getData();
+    char* end = data + source.mappedSize();
     const char* entry[255];
-    while (*data != '\0') {
+    while (data < end && *data != '\0') {
         const size_t columns = Util::getWordsOfLine(data, entry, 255);
         if (columns < 2) {
             Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n";
diff --git a/src/util/gff2db.cpp b/src/util/gff2db.cpp
index 001c62801..3db8899ae 100644
--- a/src/util/gff2db.cpp
+++ b/src/util/gff2db.cpp
@@ -84,8 +84,9 @@ int gff2db(int argc, const char **argv, const Command &command) {
                 EXIT(EXIT_FAILURE);
             }
             char *data = (char *) file.getData();
+            char* end = data + file.mappedSize();
             size_t idx = 0;
-            while (*data != '\0') {
+            while (data < end && *data != '\0') {
                 // line is a comment or empty
                 if (*data == '#' || *data == '\n') {
                     data = Util::skipLine(data);

From 777c20355201435431c93351168a2d4d957ce704 Mon Sep 17 00:00:00 2001
From: Milot Mirdita <milot@mirdita.de>
Date: Wed, 16 Nov 2022 16:23:14 +0900
Subject: [PATCH 5/5] Update regression to fix oversubscription error in mpi
 test

---
 util/regression | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/regression b/util/regression
index 622b6d310..027d37c2c 160000
--- a/util/regression
+++ b/util/regression
@@ -1 +1 @@
-Subproject commit 622b6d3106ca5acfa690a003b656fa2c9b3ea59b
+Subproject commit 027d37c2c2f52fd9d1625b11003efc1728b524c8