diff --git a/CMakeLists.txt b/CMakeLists.txt index 902e3d87..6fa888bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,7 @@ set(LIBRARY_NAME file(GLOB kProcessorLibSrc ${CMAKE_CURRENT_SOURCE_DIR}/src/algorithms.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/extend_algorithms.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/kDataFrames/defaultColumn.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/kDataFrames/kDataFrameBlight.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/kDataFrames/kDataFrameBMQF.cpp @@ -118,8 +119,8 @@ file(GLOB kProcessorLibSrc ${CMAKE_CURRENT_SOURCE_DIR}/src/kDataFrames/kDataFramePHMAP.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/Utils/utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ThirdParty/KMC/kmc_api/*cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/restriction_tags.cpp ) diff --git a/include/kProcessor/kDataFrame.hpp b/include/kProcessor/kDataFrame.hpp index 362c0ff3..1f3cfe0f 100644 --- a/include/kProcessor/kDataFrame.hpp +++ b/include/kProcessor/kDataFrame.hpp @@ -14,6 +14,8 @@ #include "defaultColumn.hpp" #include +#include "restriction_tags.hpp" + using phmap::flat_hash_map; using namespace std; @@ -295,7 +297,7 @@ class kDataFrame{ uint32_t lastCheckpoint; Column* defaultColumn; virtual void preprocessKmerOrder(); - + tag * kDataFrame_tags; kDataFrameIterator* endIterator; public: bool isKmersOrderComputed; @@ -553,6 +555,7 @@ class kDataFrameMQF: public kDataFrame{ kDataFrameMQF(); explicit kDataFrameMQF(std::uint64_t kSize); kDataFrameMQF(std::uint64_t kSize, hashingModes hash_mode); + kDataFrameMQF(QF *mqf, readingModes RM, hashingModes HM, map params); kDataFrameMQF(std::uint64_t ksize,uint8_t q,uint8_t fixedCounterSize,uint8_t tagSize ,double falsePositiveRate); @@ -660,6 +663,7 @@ class kDataFrameBMQF: public kDataFrame{ kDataFrameBMQF(std::uint64_t kSize,uint64_t nKmers,string path); kDataFrameBMQF(std::uint64_t ksize,uint8_t q,uint8_t fixedCounterSize,uint8_t tagSize,double falsePositiveRate,string path); kDataFrameBMQF(bufferedMQF* bufferedmqf,std::uint64_t ksize,double falsePositiveRate); + kDataFrameBMQF(bufferedMQF* bufferedmqf, readingModes RM, hashingModes HM, map params); //count histogram is array where count of kmers repeated n times is found at index n. index 0 holds number of distinct kmers. kDataFrameBMQF(std::uint64_t ksize,vector countHistogram,uint8_t tagSize ,double falsePositiveRate); @@ -736,6 +740,7 @@ class kDataFrameMAP : public kDataFrame kDataFrameMAP(std::uint64_t ksize); kDataFrameMAP(std::uint64_t kSize,vector kmersHistogram); kDataFrameMAP(std::uint64_t kSize,uint64_t nKmers); + kDataFrameMAP(readingModes RM, hashingModes HM, map params); kDataFrame* getTwin(); void reserve (std::uint64_t n); void reserve (vector countHistogram); diff --git a/include/kProcessor/restriction_tags.hpp b/include/kProcessor/restriction_tags.hpp new file mode 100644 index 00000000..736165a8 --- /dev/null +++ b/include/kProcessor/restriction_tags.hpp @@ -0,0 +1,47 @@ +#ifndef _RESTRICTION_TAGS_H_ +#define _RESTRICTION_TAGS_H_ + +#include +#include +#include +#include +#include + +using std::map; +using std::string; +using std::vector; +using std::to_string; +using std::stoi; +using std::cout; +using std::cerr; +using std::endl; + + +class tag { + + public: + vector active_tags; + map restrictions = { + {"min_kSize", 7}, + {"max_kSize", 31}, + {"sorted", false}, + }; + + tag(){} + tag(map tags); + + void add_restriction(string tag_name, int value); + void check_restrictions(); + + void tag_min_kSize(int value); + void tag_max_kSize(int value); + + ~tag() {} + }; + + +typedef void (tag::*intFunc)(int); +typedef map intFuncMap; + + +#endif diff --git a/src/kDataFrames/kDataFrameBMQF.cpp b/src/kDataFrames/kDataFrameBMQF.cpp index f6ac825d..429cb357 100644 --- a/src/kDataFrames/kDataFrameBMQF.cpp +++ b/src/kDataFrames/kDataFrameBMQF.cpp @@ -73,6 +73,21 @@ kDataFrameBMQF::kDataFrameBMQF(uint64_t ksize,uint64_t nKmers,string path): fileName=path; reserve(nKmers); } + +kDataFrameBMQF::kDataFrameBMQF(bufferedMQF* bufferedmqf, readingModes RM, hashingModes HM, map params){ + this->bufferedmqf=bufferedmqf; + this->falsePositiveRate=falsePositiveRate; + KD = kmerDecoder::getInstance(RM, HM, params); + this->kSize = KD->get_kSize(); + hashbits=this->bufferedmqf->memoryBuffer->metadata->key_bits; + hashbits=2*kSize; + range=(1ULL<endIterator(); + endIterator=new kDataFrameIterator(it,(kDataFrame*)this); + +} + kDataFrameBMQF::kDataFrameBMQF(bufferedMQF* bufferedmqf,uint64_t ksize,double falsePositiveRate): kDataFrame(ksize) { @@ -414,24 +429,29 @@ void kDataFrameBMQF::serialize(string filePath){ ofstream file(filePath+".extra"); file<KD->hash_mode << endl; - file.close(); - + file << this->KD->slicing_mode << endl; + file << this->KD->params_to_string() << endl; bufferedMQF_serialize(bufferedmqf); } kDataFrame* kDataFrameBMQF::load(string filePath){ - ifstream file(filePath+".extra"); + uint64_t filekSize; - int hashing_mode; - double flasePositiveRate; - file>>filekSize; + int hashing_mode, reading_mode; + string KD_params_string; + + ifstream file(filePath+".extra"); + file >> filekSize; file >> hashing_mode; - - flasePositiveRate = (hashing_mode == 1) ? 0 : 0.5; + file >> reading_mode; + file >> KD_params_string; + hashingModes hash_mode = static_cast(hashing_mode); + readingModes slicing_mode = static_cast(reading_mode); + map kmerDecoder_params = kmerDecoder::string_to_params(KD_params_string); bufferedMQF* bufferedmqf=new bufferedMQF(); bufferedMQF_deserialize(bufferedmqf,(filePath+".bmqf").c_str()); - return new kDataFrameBMQF(bufferedmqf,filekSize, flasePositiveRate); + return new kDataFrameBMQF(bufferedmqf, slicing_mode, hash_mode, kmerDecoder_params); } kDataFrameIterator kDataFrameBMQF::begin(){ diff --git a/src/kDataFrames/kDataFrameMAP.cpp b/src/kDataFrames/kDataFrameMAP.cpp index d55228c7..d41120c4 100644 --- a/src/kDataFrames/kDataFrameMAP.cpp +++ b/src/kDataFrames/kDataFrameMAP.cpp @@ -123,6 +123,12 @@ kDataFrameMAP::kDataFrameMAP() { // this->hasher = (new IntegerHasher(23)); } +kDataFrameMAP::kDataFrameMAP(readingModes RM, hashingModes HM, map params) { + this->class_name = "MAP"; // Temporary until resolving #17 + KD = kmerDecoder::getInstance(RM, HM, params); + this->kSize = KD->get_kSize(); +} + bool kDataFrameMAP::kmerExist(string kmerS) { return (this->MAP.find(kmer::str_to_canonical_int(kmerS)) == this->MAP.end()) ? 0 : 1; } @@ -216,8 +222,11 @@ void kDataFrameMAP::serialize(string filePath) { // Write the kmerSize ofstream file(filePath + ".extra"); file << kSize << endl; - file << 2 << endl; + file << this->KD->hash_mode << endl; + file << this->KD->slicing_mode << endl; + file << this->KD->params_to_string() << endl; file.close(); + std::ofstream os(filePath + ".map", std::ios::binary); cereal::BinaryOutputArchive archive(os); archive(this->MAP); @@ -227,19 +236,22 @@ void kDataFrameMAP::serialize(string filePath) { kDataFrame *kDataFrameMAP::load(string filePath) { // Load kSize - ifstream file(filePath + ".extra"); uint64_t kSize; - int hashing_mode; + int hashing_mode, reading_mode; + string KD_params_string; + + ifstream file(filePath + ".extra"); file >> kSize; file >> hashing_mode; + file >> reading_mode; + file >> KD_params_string; - if(hashing_mode != 2){ - std::cerr << "Error: In the kDataFrameMAP, hashing must be 2:TwoBitsRepresentation mode" << endl; - exit(1); - } + hashingModes hash_mode = static_cast(hashing_mode); + readingModes slicing_mode = static_cast(reading_mode); + map kmerDecoder_params = kmerDecoder::string_to_params(KD_params_string); file.close(); // Initialize kDataFrameMAP - kDataFrameMAP *KMAP = new kDataFrameMAP(kSize); + kDataFrameMAP *KMAP = new kDataFrameMAP(slicing_mode, hash_mode, kmerDecoder_params); // Load the hashMap into the kDataFrameMAP std::ifstream os(filePath + ".map", std::ios::binary); diff --git a/src/kDataFrames/kDataFrameMQF.cpp b/src/kDataFrames/kDataFrameMQF.cpp index 7435aad7..467b397a 100644 --- a/src/kDataFrames/kDataFrameMQF.cpp +++ b/src/kDataFrames/kDataFrameMQF.cpp @@ -5,6 +5,7 @@ #include #include #include +#include "restriction_tags.hpp" /* @@ -145,6 +146,10 @@ kDataFrameIterator kDataFrameMQF::find(uint64_t kmer) { kDataFrameMQF::kDataFrameMQF() : kDataFrame() { this->class_name = "MQF"; // Temporary until resolving #17 + kDataFrame_tags = new tag({{"min_kSize", 17}}); // Add all restrictions at once + kDataFrame_tags->add_restriction("max_kSize", 31); // Add a new restriction + kDataFrame_tags->add_restriction("sorted", true); + mqf = new QF(); qf_init(mqf, (1ULL << 16), 2 * kSize, 0, 2, 32, true, "", 2038074761); KD = (new Kmers(kSize)); @@ -261,6 +266,26 @@ kDataFrameMQF::kDataFrameMQF(QF *mqf, uint64_t ksize, double falsePositiveRate) it->endIterator(); endIterator=new kDataFrameIterator(it,(kDataFrame*)this); } + + +kDataFrameMQF::kDataFrameMQF(QF *mqf, readingModes slicing_mode, hashingModes hash_mode, map kmerDecoder_params){ + this->class_name = "MQF"; // Temporary until resolving #17 + this->mqf = mqf; + this->falsePositiveRate = falsePositiveRate; + if (falsePositiveRate == 0) { + KD = (new Kmers(kSize, integer_hasher)); + } else if (falsePositiveRate < 1) { + KD = (new Kmers(kSize, mumur_hasher)); + } + hashbits = this->mqf->metadata->key_bits; + hashbits = 2 * kSize; + range = (1ULL << hashbits); + kDataFrameMQFIterator *it = new kDataFrameMQFIterator(mqf, kSize, KD); + it->endIterator(); + endIterator = new kDataFrameIterator(it,(kDataFrame*)this); +} + + kDataFrameMQF::kDataFrameMQF(uint64_t ksize, vector countHistogram, uint8_t tagSize, double falsePositiveRate) : kDataFrame(ksize) { @@ -542,34 +567,36 @@ float kDataFrameMQF::max_load_factor() { void kDataFrameMQF::serialize(string filePath) { - //filePath += ".mqf"; ofstream file(filePath + ".extra"); file << kSize << endl; file << this->KD->hash_mode << endl; + file << this->KD->slicing_mode << endl; + file << this->KD->params_to_string() << endl; file.close(); - // uint64_t legendSize=tagsLegend.size(); - // file<first<<" "<second<> filekSize; + file >> kSize; file >> hashing_mode; + file >> reading_mode; + file >> KD_params_string; + + hashingModes hash_mode = static_cast(hashing_mode); + readingModes slicing_mode = static_cast(reading_mode); + map kmerDecoder_params = kmerDecoder::string_to_params(KD_params_string); + double flasePositiveRate; flasePositiveRate = (hashing_mode == 1) ? 0 : 0.5; file.close(); QF *mqf = new QF(); qf_deserialize(mqf, (filePath + ".mqf").c_str()); - return new kDataFrameMQF(mqf, filekSize, flasePositiveRate); + return new kDataFrameMQF(mqf, slicing_mode, hash_mode, kmerDecoder_params); } void kDataFrameMQF::preprocessKmerOrder() diff --git a/src/kDataFrames/kDataFramePHMAP.cpp b/src/kDataFrames/kDataFramePHMAP.cpp index 1c921cad..1a6ee09c 100644 --- a/src/kDataFrames/kDataFramePHMAP.cpp +++ b/src/kDataFrames/kDataFramePHMAP.cpp @@ -237,7 +237,8 @@ void kDataFramePHMAP::serialize(string filePath) { ofstream file(filePath + ".extra"); file << kSize << endl; file << this->KD->hash_mode << endl; - file.close(); + file << this->KD->slicing_mode << endl; + file << this->KD->params_to_string() << endl; filePath += ".phmap"; { phmap::BinaryOutputArchive ar_out(filePath.c_str()); @@ -249,15 +250,23 @@ void kDataFramePHMAP::serialize(string filePath) { kDataFrame *kDataFramePHMAP::load(string filePath) { // Load kSize + int kSize, hashing_mode, reading_mode; + string KD_params_string; + ifstream file(filePath + ".extra"); - uint64_t kSize; - int hashing_mode; file >> kSize; file >> hashing_mode; + file >> reading_mode; + file >> KD_params_string; file.close(); - filePath += ".phmap"; + hashingModes hash_mode = static_cast(hashing_mode); - kDataFramePHMAP *KMAP = new kDataFramePHMAP(kSize, hash_mode); + readingModes slicing_mode = static_cast(reading_mode); + map kmerDecoder_params = kmerDecoder::string_to_params(KD_params_string); + + filePath += ".phmap"; + + kDataFramePHMAP *KMAP = new kDataFramePHMAP(slicing_mode, hash_mode, kmerDecoder_params); { phmap::BinaryInputArchive ar_in(filePath.c_str()); KMAP->MAP.load(ar_in); diff --git a/src/restriction_tags.cpp b/src/restriction_tags.cpp new file mode 100644 index 00000000..87fea821 --- /dev/null +++ b/src/restriction_tags.cpp @@ -0,0 +1,53 @@ +#include "restriction_tags.hpp" + +intFuncMap tagToFunc; + +tag::tag(map tags) { + for (auto & tag_info: tags) { + string tag_name = tag_info.first; + int tag_value = tag_info.second; + add_restriction(tag_name, tag_value); + } + + tagToFunc["min_kSize"] = tag_min_kSize; + tagToFunc["max_kSize"] = tag_max_kSize; +} + +void tag::add_restriction(string tag_name, int value) { + bool valid_restriction = (restrictions.find(tag_name) != restrictions.end()); + if (valid_restriction) { + restrictions[tag_name] = value; + active_tags.push_back(tag_name); + } else throw std::invalid_argument("(" + tag_name + ") is not recognized."); +} + +// Checks for rules violation in active_tags +void tag::check_restrictions() { + // Check for missing tags + for (auto & active_tag: active_tags) { + int tag_value = restrictions[active_tag]; + std::string + function = std::string(active_tag); + + // Call the validation function + intFuncMap::iterator tagFunc = tagToFunc.find(function); + if (tagFunc != tagToFunc.end()) { + tag m; + (m.*(tagFunc -> second))(tag_value); + } + } +} + +// Check functions + +void tag::tag_min_kSize(int value) { + if (value < restrictions["min_kSize"]) { + throw std::logic_error("kSize must: " + to_string(restrictions["min_kSize"]) + " < kSize > " + to_string(restrictions["max_kSize"])); + } +} + +void tag::tag_max_kSize(int value) { + if (value > restrictions["max_kSize"]) { + throw std::logic_error("kSize must: " + to_string(restrictions["min_kSize"]) + " < kSize > " + to_string(restrictions["max_kSize"])); + } +} \ No newline at end of file