Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add subcommand to mask sequences #4481

Merged
merged 4 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion deps/libbdsg
Submodule libbdsg updated 38 files
+9 −4 CMakeLists.txt
+2 −2 bdsg/cmake_bindings/bdsg.cpp
+1 −1 bdsg/cmake_bindings/bdsg.sources
+77 −75 bdsg/cmake_bindings/bdsg/graph_proxy.cpp
+70 −68 bdsg/cmake_bindings/bdsg/graph_proxy_1.cpp
+76 −74 bdsg/cmake_bindings/bdsg/internal/base_packed_graph.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/eades_algorithm.cpp
+26 −5 bdsg/cmake_bindings/bdsg/internal/hash_map.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/is_single_stranded.cpp
+21 −21 bdsg/cmake_bindings/bdsg/internal/mapped_structs_1.cpp
+3 −3 bdsg/cmake_bindings/bdsg/overlays/packed_path_position_overlay.cpp
+6 −7 bdsg/cmake_bindings/bdsg/overlays/packed_path_position_overlay_1.cpp
+14 −3 bdsg/cmake_bindings/bdsg/overlays/path_position_overlays.cpp
+1 −3 bdsg/cmake_bindings/bdsg/overlays/path_subgraph_overlay.cpp
+1,121 −0 bdsg/cmake_bindings/bdsg/overlays/reference_path_overlay.cpp
+1 −1 bdsg/cmake_bindings/bdsg/overlays/strand_split_overlay.cpp
+516 −116 bdsg/cmake_bindings/bdsg/overlays/vectorizable_overlays.cpp
+0 −817 bdsg/cmake_bindings/bdsg/overlays/vectorizable_overlays_1.cpp
+59 −7 bdsg/cmake_bindings/bdsg/packed_graph.cpp
+42 −35 bdsg/cmake_bindings/bdsg/snarl_distance_index.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/expanding_overlay_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/handle_graph.cpp
+15 −1 bdsg/cmake_bindings/handlegraph/mutable_path_metadata.cpp
+28 −2 bdsg/cmake_bindings/handlegraph/mutable_path_mutable_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_metadata.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_position_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/trivially_serializable.cpp
+4 −0 bdsg/cmake_bindings/std/bdsg/internal/binder_hook_bind.cpp
+1 −1 bdsg/deps/libhandlegraph
+6 −1 bdsg/include/bdsg/hash_graph.hpp
+36 −0 bdsg/include/bdsg/internal/base_packed_graph.hpp
+8 −0 bdsg/include/bdsg/internal/binder_hook_bind.hpp
+8 −0 bdsg/include/bdsg/internal/graph_proxy_mutable_path_deletable_handle_graph_fragment.classfragment
+70 −20 bdsg/include/bdsg/snarl_distance_index.hpp
+7 −0 bdsg/src/hash_graph.cpp
+8 −1 bdsg/src/test_libbdsg.cpp
+38 −7 make_and_run_binder.py
395 changes: 395 additions & 0 deletions src/masker.cpp

Large diffs are not rendered by default.

57 changes: 57 additions & 0 deletions src/masker.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef VG_MASKER_HPP_INCLUDED
#define VG_MASKER_HPP_INCLUDED

#include <memory>

#include <gbwtgraph/gbwtgraph.h>

#include "handle.hpp"
#include "snarls.hpp"
#include "region.hpp"

namespace vg {

/*
* Class to mask out graph regions with N's
*/
class Masker {

public:

// construct own snarl decomposition
// use external snarl decomposition
Masker(gbwtgraph::GBWTGraph& graph, const SnarlManager* snarl_manager = nullptr);
Masker(MutablePathDeletableHandleGraph& graph, const SnarlManager* snarl_manager = nullptr);
Masker() = default;
~Masker() = default;

// move
Masker(Masker&& other) = default;
Masker& operator=(Masker&& other) = default;

// replace regions with N's, with regions given as tuples of (path name, region begin, region end).
// regions indexes are assumed to be 0-based and end-exclusive
void mask_sequences(const std::vector<std::tuple<std::string, size_t, size_t>>& regions) const;

private:

// make our own snarl manager
void init_snarls();

// takes tuples of (first step, offset in first step, last step, end in last step)
// uses the lambda function provided to apply mask sequences to the graph
void apply_mask_sequences(const std::vector<std::tuple<step_handle_t, size_t, step_handle_t, size_t>>& mask_intervals,
const std::function<void(handle_t, const std::string&)>& apply_mask) const;

PathHandleGraph* graph = nullptr;
const SnarlManager* snarl_manager = nullptr;

// if not given a snarl manager, we construct one and make the snarl_manager
// point to it
std::unique_ptr<SnarlManager> own_snarl_manager;

};

}

#endif
177 changes: 177 additions & 0 deletions src/subcommand/mask_main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#include "subcommand.hpp"
#include "../io/save_handle_graph.hpp"
#include "../handle.hpp"
#include "../xg.hpp"
#include "../utility.hpp"
#include "../snarls.hpp"
#include "../masker.hpp"
#include <vg/io/stream.hpp>
#include <vg/io/vpkg.hpp>
#include <gbwtgraph/gbz.h>

#include <unistd.h>
#include <getopt.h>
#include <sstream>

using namespace vg;
using namespace vg::subcommand;
using namespace vg::io;
using namespace std;

vector<tuple<string, size_t, size_t>> parse_bed(istream& in){

vector<tuple<string, size_t, size_t>> regions;

string line_buffer;
string token_buffer;
while (getline(in, line_buffer)) {

if (line_buffer.empty() || line_buffer.front() == '#' || line_buffer.substr(0, 5) == "track" || line_buffer.substr(0, 7) == "browser") {
// header line
continue;
}
istringstream line_strm(line_buffer);

regions.emplace_back();

getline(line_strm, token_buffer, '\t');
get<0>(regions.back()) = token_buffer;
getline(line_strm, token_buffer, '\t');
get<1>(regions.back()) = parse<int64_t>(token_buffer);
getline(line_strm, token_buffer, '\t');
get<2>(regions.back()) = parse<int64_t>(token_buffer);
}

return regions;
}

void help_mask(char** argv) {
cerr << "usage: " << argv[0] << " [options] <graph>" << endl
<< "Mask out specified regions of a graph with N's" << endl
<< endl
<< "input options: " << endl
<< " -b, --bed FILE BED regions corresponding to path intervals of the graph to target (required)" << endl
<< " -g, --gbz-input Input graph is in GBZ format" << endl
<< " -s, --snarls FILE Snarls from vg snarls (computed directly if not provided)" << endl

<< endl;
}

int main_mask(int argc, char** argv) {

string bed_filepath;
string snarls_filepath;
bool gbz_input = false;

int c;
optind = 2; // force optind past command positional argument
while (true) {
static struct option long_options[] =
{
{"help", no_argument, 0, 'h'},
{"bed", required_argument, 0, 'b'},
{"gbz-input", no_argument, 0, 'g'},
{"snarls", required_argument, 0, 's'},
{0, 0, 0, 0}

};
int option_index = 0;
c = getopt_long (argc, argv, "hb:gs:", long_options, &option_index);

// Detect the end of the options.
if (c == -1)
break;

switch (c)
{

case '?':
case 'h':
help_mask(argv);
return 0;
case 'b':
bed_filepath = optarg;
break;
case 'g':
gbz_input = true;
break;
case 's':
snarls_filepath = optarg;
break;
default:
help_mask(argv);
return 1;
}
}


if (argc - optind != 1) {
cerr << "error: vg mask requires exactly 1 positional argument" << endl;
help_mask(argv);
return 1;
}

if (bed_filepath.empty()) {
cerr << "error: vg mask requires an input BED file from -b / --bed" << endl;
return 1;
}
if (bed_filepath != "-") {
if (!ifstream(bed_filepath)) {
cerr << "error: could not open BED file from " << bed_filepath << endl;
return 1;
}
}
if (!snarls_filepath.empty() && snarls_filepath != "-") {
if (!ifstream(snarls_filepath)) {
cerr << "error: could not open snarls file from " << snarls_filepath << endl;
return 1;
}
}

// load the graph
string graph_path = get_input_file_name(optind, argc, argv);
unique_ptr<MutablePathDeletableHandleGraph> graph;
unique_ptr<gbwtgraph::GBZ> gbz;
if (gbz_input) {
gbz = vg::io::VPKG::load_one<gbwtgraph::GBZ>(graph_path);;
}
else {
graph = vg::io::VPKG::load_one<MutablePathDeletableHandleGraph>(graph_path);
}

// load the BED
vector<tuple<string, size_t, size_t>> regions;
get_input_file(bed_filepath, [&](istream& in) {
regions = std::move(parse_bed(in));
});

// load the snarls
unique_ptr<SnarlManager> snarl_manager;
if (!snarls_filepath.empty()) {
snarl_manager = vg::io::VPKG::load_one<SnarlManager>(snarls_filepath);
}

Masker masker;
if (gbz.get()) {
masker = std::move(Masker(gbz->graph, snarl_manager.get()));
}
else {
masker = std::move(Masker(*graph, snarl_manager.get()));
}

masker.mask_sequences(regions);

// write the graph
if (gbz.get()) {
gbz->simple_sds_serialize(cout);
}
else {
vg::io::save_handle_graph(graph.get(), cout);
}

return 0;
}


// Register subcommand
static Subcommand vg_mask("mask", "Mask out sequences in a graph with N's", main_mask);
Loading
Loading