From 28373b8a638590e0ddbebe2c5f117bd017d46c0e Mon Sep 17 00:00:00 2001 From: liur34 Date: Fri, 2 Aug 2019 00:20:22 -0700 Subject: [PATCH] add inline comments --- neusomatic/include/bam_variant.hpp | 2 ++ neusomatic/include/change_coordinates.hpp | 3 +++ neusomatic/include/msa.hpp | 19 +++++++++++++- neusomatic/include/msa_utils.hpp | 30 ++++++++++++++++------- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/neusomatic/include/bam_variant.hpp b/neusomatic/include/bam_variant.hpp index 7ac5057..048f73b 100644 --- a/neusomatic/include/bam_variant.hpp +++ b/neusomatic/include/bam_variant.hpp @@ -1,6 +1,8 @@ #ifndef BAM_VARIANT_HPP #define BAM_VARIANT_HPP +// For an alignment record, get the variants based on the MD tags. + #include #include #include "variant.hpp" diff --git a/neusomatic/include/change_coordinates.hpp b/neusomatic/include/change_coordinates.hpp index a126c1c..d895396 100644 --- a/neusomatic/include/change_coordinates.hpp +++ b/neusomatic/include/change_coordinates.hpp @@ -1,6 +1,9 @@ #ifndef CHANGE_COORDINATES_HPP #define CHANGE_COORDINATES_HPP +// Given a reference string in gapped space (reference bases seperated by 0 or more gaps), +// This class can calculate the gapped postion based on a ungapped postion, or vice versa. + #include "SeqanUtils.h" namespace neusomatic { diff --git a/neusomatic/include/msa.hpp b/neusomatic/include/msa.hpp index e0086cb..8b28f3c 100644 --- a/neusomatic/include/msa.hpp +++ b/neusomatic/include/msa.hpp @@ -162,6 +162,11 @@ class ReadSeqGap { template class MSABuilder { + /* + * Building a MSA from pairwise bam alignment. + * This requires reading a pileup of alignment, usually in BAM format + * Each MSABuilder contains intrinsiclly a convertion from nongapped space to gapped space. + */ const std::vector& bam_records_; //std::vector rids_; GappedSeq ref_gaps_; @@ -217,6 +222,9 @@ class MSABuilder { auto GetGappedSeqAndQual(const BamRecord& r) const { + //Initialize a seq of MSA row length with ~(missing char). Replace ~ with -(gap char) between the begin and + //end pos (in gapped space) of a alignment. Iterate through the cigars, fill in the seq with read bases based + //on the cigars. Some idea for qual. std::string msa_bases(ncol(), missing_chr_); std::string msa_bquals(ncol(), 33); @@ -279,7 +287,7 @@ class MSABuilder { } read_pos += c->Length(); } - if (c->Type() == 'D') { + if (c->Type() == 'D') { // for deletion qual, always use one base before. const int l = std::max(ref_pos, ref_gaps_.left()); const int r = std::min(int32_t(ref_pos + c->Length()), ref_gaps_.right()); if (l < r) { @@ -308,6 +316,9 @@ class MSABuilder { } MSABuilder(const GInv& ginv, const std::vector& bams, const std::string& refstr): + //Given a reference seq (nongapped) and a set of bam records, set up the MSA + //reference in gapped space. For each ref pos, set up the gap length as the longest + //insertion length of the pileup. bam_records_(bams), ref_gaps_(refstr, ginv) { BuildRefGap(bams, ref_gaps_); ncol_ = ref_gaps_.length(); @@ -329,7 +340,12 @@ class MSABuilder { } } + //Deprecated std::vector GetMSA() const { + // For each bam record, first get all variants. And then use the variants to + // To get the variants, it uses the MD tags which may or may not exist depends on the aligner. + // replace the reference bases if applies. + // Return a MSA matrix as vector of strings std::vector result(bam_records_.size()); for (size_t i = 0; i < bam_records_.size(); ++i) { auto const& r = bam_records_[i]; @@ -341,6 +357,7 @@ class MSABuilder { } auto GetClipping(const BamRecord& r) const { + // Return the clipping positions. If no clipping return -1. int lsc = -1, rsc = -1; auto const& cigar = r.GetCigar(); auto front_cigar=cigar.front().Type(); diff --git a/neusomatic/include/msa_utils.hpp b/neusomatic/include/msa_utils.hpp index 8ac2278..d4b7a09 100644 --- a/neusomatic/include/msa_utils.hpp +++ b/neusomatic/include/msa_utils.hpp @@ -10,11 +10,18 @@ namespace neusomatic{ class Col{ + /* + * Each Col is a reduced representation of a MSA column, + * i.e., counts of A,T,C,G,-,~, with averaged base qualities and etc.. + * A,C,G,T-(gap),~(missing) are coded as 0,1,2,3,4,5 + * Therefore, we can use a vector to store the mean or the freq for different base, the accessed by the indices. + * For instance, base_freq_[2] is the occurance (maybe normalized by depth) of G in a MSA col. + */ private: using Idx = unsigned; using Base = int; - std::vector bases_; - std::vector bquals_; + std::vector bases_; // deprecated, no longer storing individual base + std::vector bquals_; // deprecated, no longer storing individual qual public: static const int ALPHABET_SIZE = 6; // a, t, c, g, gap and missing_char @@ -34,32 +41,33 @@ class Col{ std::vector lsc_mean; std::vector rsc_mean; std::vector> tag_mean; - decltype(auto) bases() const {return (bases_);} - decltype(auto) bquals() const {return (bquals_);} + decltype(auto) bases() const {return (bases_);} // deprecated, see above + decltype(auto) bquals() const {return (bquals_);} // deprecated, see above - void emplace(const Idx& id, const Base& b) { + void emplace(const Idx& id, const Base& b) { // deprecated, see above bases_[id] = b; } - void emplace_qual(const Idx& id, const int& b) { + void emplace_qual(const Idx& id, const int& b) {// deprecated, see above bquals_[id] = b; } - decltype(auto) at(Idx i) const { + decltype(auto) at(Idx i) const {// deprecated, see above return bases_.at(i); } - decltype(auto) size() const { + decltype(auto) size() const {// deprecated, see above return bases_.size(); } - decltype(auto) operator[](Idx i) { + decltype(auto) operator[](Idx i) {// deprecated, see above return (bases_[i]); } }; template class CondensedArray{ + //CondenseArray is an array of Col (ColSpace). public: using Idx = unsigned; static const unsigned char missing_chr_ = '~'; @@ -130,8 +138,10 @@ class CondensedArray{ CondensedArray() : nrow_(0) {} + //deprecated. template explicit CondensedArray(const std::vector& msa, const int& total_cov, const GInv& ginv, const std::string& refgap) : + //construct a condensed array without qualities and other info from a vector of base strings. nrow_(msa.size()), cspace_(msa[0].size(), Col(msa.size())), @@ -205,8 +215,10 @@ class CondensedArray{ } + //deprecated template explicit CondensedArray(const std::vector& msa, const std::vector& bqual, + //construct a condensed array with qualities and other info from multiple inputs. const std::vector& mqual, const std::vector& strand, const std::vector& lsc, const std::vector& rsc, const std::vector>& tags,