From 60e336df18a26873e5a13ec3aeae04a1b6e7cc29 Mon Sep 17 00:00:00 2001 From: Ruoshi Zhang Date: Mon, 11 May 2020 20:49:55 +0200 Subject: [PATCH 1/5] Update combine pval agg-mode 3 --- src/multihit/combinepvalperset.cpp | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/multihit/combinepvalperset.cpp b/src/multihit/combinepvalperset.cpp index 2c91f9a2d..30f0ee84d 100644 --- a/src/multihit/combinepvalperset.cpp +++ b/src/multihit/combinepvalperset.cpp @@ -166,21 +166,41 @@ class PvalueAggregator : public Aggregation { updatedPval = exp(sumLogPval); } - //3) the P-values of the truncated product method + //3) the P-values of the (modified) truncated product method else if(aggregationMode == Parameters::AGGREGATION_MODE_TRUNCATED_PRODUCT){ + //new theory: taking the best hit regardless of threshold and (from second hit on)sum of how much it surpassed threshold unsigned int orfCount = Util::fast_atoi(querySizeReader->getDataByDBKey(querySetKey, thread_idx)); double logPvalThreshold = log(alpha / (orfCount + 1)); - double sumLogPval = 0; + double minLogPval = 0; + double sumLogPval = 0; + size_t k = 0; for (size_t i = 0; i < dataToAggregate.size(); ++i) { double logPvalue = std::strtod(dataToAggregate[i][1].c_str(), NULL); + if (logPvalue < minLogPval) { + if (logPvalue == 0) { + //to avoid -0.0 + minLogPval = logPvalue; + } + else {minLogPval = -logPvalue;} + } if (logPvalue < logPvalThreshold) { - sumLogPval += logPvalue; + //sum up the part exceeding logThreshold, add a minus to make score positive + sumLogPval -= logPvalue - logPvalThreshold; + k++; } } - updatedPval = exp(sumLogPval); + if(k == 0){ + //if no hit passed thr, take the -log of best hit pval as score + buffer.append(SSTR(minLogPval)); + return buffer; + } + else { + //if one or more hits passed thr + buffer.append(SSTR(sumLogPval - logPvalThreshold)); + return buffer; + } } - else { Debug(Debug::ERROR) << "Invalid aggregation function!\n"; From aea233a94a9ba4a7f15fe3c99fa5147cf3f78bd5 Mon Sep 17 00:00:00 2001 From: Ruoshi Zhang Date: Fri, 2 Jul 2021 16:10:32 +0200 Subject: [PATCH 2/5] Improve gff2db to parse multiple files, and try multi-threading --- src/MMseqsBase.cpp | 4 +- src/commons/Parameters.cpp | 1 + src/util/gff2db.cpp | 231 ++++++++++++++++++++++--------------- 3 files changed, 142 insertions(+), 94 deletions(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 350e0eecf..7ac1f4c8a 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -1146,8 +1146,8 @@ std::vector baseCommands = { "Extract regions from a sequence database based on a GFF3 file", NULL, "Milot Mirdita ", - " ", - CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile }, + " ... ", + CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile }, {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}}, {"maskbygff", maskbygff, &par.gff2db, COMMAND_SPECIAL, diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 3b3763530..80c7faa0c 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -749,6 +749,7 @@ Parameters::Parameters(): // gff2db gff2db.push_back(&PARAM_GFF_TYPE); gff2db.push_back(&PARAM_ID_OFFSET); + gff2db.push_back(&PARAM_THREADS); gff2db.push_back(&PARAM_V); diff --git a/src/util/gff2db.cpp b/src/util/gff2db.cpp index 2776c48df..de0864256 100644 --- a/src/util/gff2db.cpp +++ b/src/util/gff2db.cpp @@ -4,118 +4,165 @@ #include "Util.h" #include "MemoryMapped.h" #include "Orf.h" +#include "Parameters.h" + +#ifdef OPENMP +#include +#endif int gff2db(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); - par.parseParameters(argc, argv, command, true, 0, 0); + par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); + + std::vector filenames(par.filenames); + std::string outDb = par.filenames.back(); + par.filenames.pop_back(); + std::string seqDb = par.filenames.back(); + par.filenames.pop_back(); - MemoryMapped file(par.db1, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); - if (!file.isValid()) { - Debug(Debug::ERROR) << "Could not open GFF file " << par.db1 << "\n"; - EXIT(EXIT_FAILURE); - } - char *data = (char *) file.getData(); - DBReader reader(par.db2.c_str(), par.db2Index.c_str(), 1, DBReader::USE_INDEX | DBReader::USE_DATA | DBReader::USE_LOOKUP_REV); + DBReader reader(seqDb.c_str(), (seqDb + ".index").c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA | DBReader::USE_LOOKUP_REV); reader.open(DBReader::NOSORT); - DBReader headerReader(par.hdr2.c_str(), par.hdr2Index.c_str(), 1, DBReader::USE_INDEX | DBReader::USE_DATA); + DBReader headerReader((seqDb + "_h").c_str(), (seqDb + "_h.index").c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); headerReader.open(DBReader::NOSORT); - DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_NUCLEOTIDES); + DBWriter writer(outDb.c_str(), (outDb + ".index").c_str(), par.threads, par.compressed, Parameters::DBTYPE_NUCLEOTIDES); writer.open(); - DBWriter headerWriter(par.hdr3.c_str(), par.hdr3Index.c_str(), 1, par.compressed, Parameters::DBTYPE_GENERIC_DB); + DBWriter headerWriter((outDb + "_h").c_str(), (outDb + "_h.index").c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB); headerWriter.open(); - bool shouldCompareType = par.gffType.length() > 0; - - Debug::Progress progress; - unsigned int entries_num = 0; - char buffer[1024]; - const char* fields[255]; - - std::string revStr; - revStr.reserve(par.maxSeqLen + 1); + FILE *lookup = FileUtil::openAndDelete((outDb + ".lookup").c_str(), "w"); + FILE *source = FileUtil::openAndDelete((outDb + ".source").c_str(), "w"); - while (*data != '\0') { - progress.updateProgress(); - - // line is a comment - if (*data == '#') { - data = Util::skipLine(data); - continue; - } + bool shouldCompareType = par.gffType.length() > 0; - const size_t columns = Util::getWordsOfLine(data, fields, 255); - data = Util::skipLine(data); - if (columns < 9) { - Debug(Debug::WARNING) << "Not enough columns in GFF file\n"; - continue; - } + if (par.filenames.size() < reader.getSize()) { + Debug(Debug::WARNING) << "Not enough GFF files are provided. Some results might be omitted. \n"; + //EXIT(EXIT_FAILURE); + } - if (shouldCompareType) { - std::string type(fields[2], fields[3] - fields[2] - 1); - if (type.compare(par.gffType) != 0) { - continue; + Debug::Progress progress(par.filenames.size()); +#pragma omp parallel + { + int thread_idx = 0; + +#ifdef OPENMP + thread_idx = omp_get_thread_num(); +#endif + + unsigned int entries_num = 0; + char buffer[1024]; + const char* fields[255]; + + std::string filename; + filename.reserve(100); + + std::string revStr; + revStr.reserve(par.maxSeqLen + 1); + +#pragma omp for schedule(dynamic, 10) + for (size_t i = 0; i < par.filenames.size(); ++i) { + progress.updateProgress(); + filename = par.filenames[i]; + MemoryMapped file(filename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); + if (!file.isValid()) { + Debug(Debug::ERROR) << "Could not open GFF file " << filename << "\n"; + EXIT(EXIT_FAILURE); } - } - - size_t start = Util::fast_atoi(fields[3]); - size_t end = Util::fast_atoi(fields[4]); - if (start == end) { - Debug(Debug::WARNING) << "Invalid sequence length in line " << entries_num << "!\n"; - continue; - } - - std::string name(fields[0], fields[1] - fields[0] - 1); - size_t lookupId = reader.getLookupIdByAccession(name); - if (lookupId == SIZE_MAX) { - Debug(Debug::ERROR) << "GFF entry not found in database lookup: " << name << "!\n"; - return EXIT_FAILURE; - } - unsigned int key = reader.getLookupKey(lookupId); - - size_t headerId = headerReader.getId(key); - if (headerId == UINT_MAX) { - Debug(Debug::ERROR) << "GFF entry not found in header database: " << name << "!\n"; - return EXIT_FAILURE; - } - unsigned int id = par.identifierOffset + entries_num; - - headerWriter.writeStart(0); - headerWriter.writeAdd(headerReader.getData(headerId, 0), std::max(headerReader.getSeqLen(headerId), (size_t)2) - 2, 0); - int len = snprintf(buffer, 1024, " %zu-%zu\n", start, end); - headerWriter.writeAdd(buffer, len, 0); - headerWriter.writeEnd(id, 0); - - size_t seqId = reader.getId(key); - if (seqId == UINT_MAX) { - Debug(Debug::ERROR) << "GFF entry not found in sequence database: " << name << "!\n"; - return EXIT_FAILURE; - } - - ssize_t length = end - start; - char *seq = reader.getData(seqId, 0); - - writer.writeStart(0); - if (length > 0) { - writer.writeAdd(seq + start, length + 1, 0); - } else { - for (size_t i = start; i >= start + length; i--) { - revStr.append(1, Orf::complement(seq[i])); + char *data = (char *) file.getData(); + size_t idx = 0; + fprintf(source, "%zu\t%s\n", i, filename.c_str()); + while (*data != '\0') { + // line is a comment + if (*data == '#') { + data = Util::skipLine(data); + continue; + } + //TODO: multi-word column 2 could exist + const size_t columns = Util::getWordsOfLine(data, fields, 255); + data = Util::skipLine(data); + if (columns < 9) { + Debug(Debug::WARNING) << "Not enough columns in GFF file\n"; + continue; + } + + if (shouldCompareType) { + std::string type(fields[2], fields[3] - fields[2] - 1); + if (type.compare(par.gffType) != 0) { + continue; + } + } + + size_t start = Util::fast_atoi(fields[3]); + size_t end = Util::fast_atoi(fields[4]); + if (start == end) { + Debug(Debug::WARNING) << "Invalid sequence length in line " << entries_num << "!\n"; + continue; + } + std::string strand(fields[6], fields[7] - fields[6] - 1); + std::string name(fields[0], fields[1] - fields[0] - 1); + size_t lookupId = reader.getLookupIdByAccession(name); + if (lookupId == SIZE_MAX) { + Debug(Debug::ERROR) << "GFF entry not found in database lookup: " << name << "!\n"; + EXIT(EXIT_FAILURE); + } + unsigned int key = reader.getLookupKey(lookupId); + + size_t headerId = headerReader.getId(key); + if (headerId == UINT_MAX) { + Debug(Debug::ERROR) << "GFF entry not found in header database: " << name << "!\n"; + EXIT(EXIT_FAILURE); + } + unsigned int id = par.identifierOffset + entries_num; + + idx++; + if (strand == "+") { + Orf::writeOrfHeader(buffer, key, start, end, 0, 0); + } else { + Orf::writeOrfHeader(buffer, key, end, start, 0, 0); + } + headerWriter.writeData(buffer, strlen(buffer), key, thread_idx); + + + size_t seqId = reader.getId(key); + if (seqId == UINT_MAX) { + Debug(Debug::ERROR) << "GFF entry not found in sequence database: " << name << "!\n"; + EXIT(EXIT_FAILURE); + } + + //check the strand instead of end - start + ssize_t length = end - start + 1; + char *seq = reader.getData(seqId, thread_idx); + + writer.writeStart(thread_idx); + if (strand == "+") { + fprintf(lookup, "%u\t%s_%zu_%zu_%zu\t%zu\n", id, name.c_str(), idx, start, end, i); + writer.writeAdd(seq + start - 1 , length, thread_idx); + } else { + fprintf(lookup, "%u\t%s_%zu_%zu_%zu\t%zu\n", id, name.c_str(), idx, end, start, i); + for (size_t i = end - 1 ; i >= end - length; i--) { + revStr.append(1, Orf::complement(seq[i])); + } + if (revStr.size() % 3 != 0) { + Debug(Debug::WARNING) << "The nucleotide sequence length is not divisible by 3!\n"; + } + writer.writeAdd(revStr.c_str(), revStr.size(), thread_idx); + revStr.clear(); + } + writer.writeAdd("\n", 1, thread_idx); + writer.writeEnd(id, thread_idx); + + entries_num++; } - writer.writeAdd(revStr.c_str(), revStr.size(), 0); - revStr.clear(); + file.close(); } - writer.writeAdd("\n", 1, 0); - writer.writeEnd(id, 0); - - entries_num++; } - headerWriter.close(); - writer.close(); + fclose(lookup); + fclose(source); + headerWriter.close(true); + writer.close(true); headerReader.close(); reader.close(); - file.close(); return EXIT_SUCCESS; } From 5e9ee7419ef91b7450c7eef571973debd76226af Mon Sep 17 00:00:00 2001 From: Ruoshi Zhang Date: Thu, 21 Oct 2021 15:09:43 +0200 Subject: [PATCH 3/5] createRenumberedDB can also keep entry name --- src/commons/DBWriter.cpp | 6 ++++-- src/commons/DBWriter.h | 2 +- src/util/gff2db.cpp | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp index 9ae84bae1..eae05b717 100644 --- a/src/commons/DBWriter.cpp +++ b/src/commons/DBWriter.cpp @@ -678,7 +678,7 @@ void DBWriter::writeThreadBuffer(unsigned int idx, size_t dataSize) { } } -void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, int sortMode) { +void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, bool keepEntryName, int sortMode) { DBReader* lookupReader = NULL; FILE *sLookup = NULL; if (origData.empty() == false && origIndex.empty() == false) { @@ -711,7 +711,9 @@ void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string size_t lookupId = lookupReader->getLookupIdByKey(idx->id); DBReader::LookupEntry copy = lookup[lookupId]; copy.id = i; - copy.entryName = SSTR(idx->id); + if(!keepEntryName){ + copy.entryName = SSTR(idx->id); + } lookupReader->lookupEntryToBuffer(strBuffer, copy); written = fwrite(strBuffer.c_str(), sizeof(char), strBuffer.size(), sLookup); if (written != (int) strBuffer.size()) { diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h index 9d86374d7..409a509c5 100644 --- a/src/commons/DBWriter.h +++ b/src/commons/DBWriter.h @@ -59,7 +59,7 @@ class DBWriter : public MemoryTracker { template static void writeIndexEntryToFile(FILE *outFile, char *buff1, T &index); - static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, int sortMode = DBReader::SORT_BY_ID_OFFSET); + static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, bool keepEntryName = false, int sortMode = DBReader::SORT_BY_ID_OFFSET); bool isClosed(){ return closed; diff --git a/src/util/gff2db.cpp b/src/util/gff2db.cpp index 92daa0bba..001c62801 100644 --- a/src/util/gff2db.cpp +++ b/src/util/gff2db.cpp @@ -33,7 +33,7 @@ int gff2db(int argc, const char **argv, const Command &command) { headerWriter.open(); std::string outLookup = outDb + ".lookup"; std::string outLookupIndex = outDb + ".lookup.index"; - DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, 0, Parameters::DBTYPE_OMIT_FILE); + DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_OMIT_FILE); lookupWriter.open(); FILE *source = FileUtil::openAndDelete((outDb + ".source").c_str(), "w"); @@ -198,7 +198,7 @@ int gff2db(int argc, const char **argv, const Command &command) { #pragma omp task { - DBWriter::createRenumberedDB(outDb, outDbIndex, outDb, outDbIndex); + DBWriter::createRenumberedDB(outDb, outDbIndex, outDb, outDbIndex, true); } } } From fcae85abc88cd88a692b5d37ed69fe568e960a49 Mon Sep 17 00:00:00 2001 From: Ruoshi Zhang Date: Tue, 11 Jan 2022 14:43:26 +0100 Subject: [PATCH 4/5] Fix seg fault when memorymapped handles files that end exactly at the end of page --- src/util/convertalignments.cpp | 6 ++++-- src/util/gff2db.cpp | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/util/convertalignments.cpp b/src/util/convertalignments.cpp index 87633f1ed..1cc59a131 100644 --- a/src/util/convertalignments.cpp +++ b/src/util/convertalignments.cpp @@ -101,8 +101,9 @@ std::map readKeyToSet(const std::string& file) { MemoryMapped lookup(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); char* data = (char *) lookup.getData(); + char* end = data + lookup.mappedSize(); const char* entry[255]; - while (*data != '\0') { + while (data < end && *data != '\0') { const size_t columns = Util::getWordsOfLine(data, entry, 255); if (columns < 3) { Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; @@ -124,8 +125,9 @@ std::map readSetToSource(const std::string& file) { MemoryMapped source(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); char* data = (char *) source.getData(); + char* end = data + source.mappedSize(); const char* entry[255]; - while (*data != '\0') { + while (data < end && *data != '\0') { const size_t columns = Util::getWordsOfLine(data, entry, 255); if (columns < 2) { Debug(Debug::WARNING) << "Not enough columns in lookup file " << file << "\n"; diff --git a/src/util/gff2db.cpp b/src/util/gff2db.cpp index 001c62801..3db8899ae 100644 --- a/src/util/gff2db.cpp +++ b/src/util/gff2db.cpp @@ -84,8 +84,9 @@ int gff2db(int argc, const char **argv, const Command &command) { EXIT(EXIT_FAILURE); } char *data = (char *) file.getData(); + char* end = data + file.mappedSize(); size_t idx = 0; - while (*data != '\0') { + while (data < end && *data != '\0') { // line is a comment or empty if (*data == '#' || *data == '\n') { data = Util::skipLine(data); From 777c20355201435431c93351168a2d4d957ce704 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Wed, 16 Nov 2022 16:23:14 +0900 Subject: [PATCH 5/5] Update regression to fix oversubscription error in mpi test --- util/regression | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/regression b/util/regression index 622b6d310..027d37c2c 160000 --- a/util/regression +++ b/util/regression @@ -1 +1 @@ -Subproject commit 622b6d3106ca5acfa690a003b656fa2c9b3ea59b +Subproject commit 027d37c2c2f52fd9d1625b11003efc1728b524c8