diff --git a/bdsg/include/bdsg/overlays/overlay_helper.hpp b/bdsg/include/bdsg/overlays/overlay_helper.hpp index 8485bfa5..caeec671 100644 --- a/bdsg/include/bdsg/overlays/overlay_helper.hpp +++ b/bdsg/include/bdsg/overlays/overlay_helper.hpp @@ -66,10 +66,11 @@ template class OverlayHelper { public: // Handle non-const base graph - T* apply(V* input_graph) { + template + T* apply(V* input_graph, Params&&... params) { auto mutable_overlaid = dynamic_cast(input_graph); if (mutable_overlaid == nullptr) { - overlay = make_unique(input_graph); + overlay = make_unique(input_graph, std::forward(params)...); mutable_overlaid = dynamic_cast(overlay.get()); assert(mutable_overlaid != nullptr); } @@ -78,10 +79,11 @@ class OverlayHelper { } // Handle const base graph - const T* apply(const V* input_graph) { + template + const T* apply(const V* input_graph, Params&&... params) { overlaid = dynamic_cast(input_graph); if (overlaid == nullptr) { - overlay = make_unique(input_graph); + overlay = make_unique(input_graph, std::forward(params)...); overlaid = dynamic_cast(overlay.get()); assert(overlaid != nullptr); } @@ -100,6 +102,7 @@ class OverlayHelper { /// Implementation of overlay helper functionality for when multiple overlays need to be stacked. // There must be a way to generalize with variadic templates // (I had trouble chaining the output of the nested overlays together and getting the types right when trying) +// TODO: Add support for passing overlay constructor arguments through. template class PairOverlayHelper { public: diff --git a/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp b/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp index 275f1c45..e72fc12c 100644 --- a/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp +++ b/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp @@ -25,7 +25,10 @@ using namespace handlegraph; /* * An overlay that adds the PathPositionHandleGraph interface to a static PathHandleGraph - * by augmenting it with compressed index data structures + * by augmenting it with compressed index data structures. + * + * TODO: Make the overlay transparent so that paths hidden in the base graph + * remain accessible through the path metadata queries. */ class PackedPositionOverlay : public PathPositionHandleGraph, public ExpandingOverlayGraph { diff --git a/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp b/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp index 49a00148..1f805ae0 100644 --- a/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp +++ b/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp @@ -18,6 +18,14 @@ using namespace handlegraph; /* * An overlay that adds fast access to paths in addition to allowing path * position queries on them. + * + * TODO: Won't work properly with paths hidden from for_each_path_handle on the + * backing graph, since they won't be indexed but we also won't pass any kind + * of queries through to the backign graph for queries we expect to be able to + * fulfil from the index. Unkike in PackedPositionOverlay, we now expect the + * index to have some path data in it, not just offset tables that we wouldn't + * expect to use for hidden (i,e, haplotype) paths. We should make the overlay + * transparent so hidden paths work properly, or remove hidden paths. */ class PackedReferencePathOverlay : public PackedPositionOverlay { diff --git a/bdsg/include/bdsg/overlays/reference_path_overlay.hpp b/bdsg/include/bdsg/overlays/reference_path_overlay.hpp index 1164e59c..d9547d03 100644 --- a/bdsg/include/bdsg/overlays/reference_path_overlay.hpp +++ b/bdsg/include/bdsg/overlays/reference_path_overlay.hpp @@ -9,6 +9,7 @@ #define BDSG_REFERENCE_PATH_OVERLAY_HPP_INCLUDED #include +#include #include #include @@ -25,12 +26,27 @@ using namespace handlegraph; * An overlay that adds fast access to paths in addition to allowing path * position queries on them. The original graph's handle_t's and path_handle_t's * remain valid for the overlay, but not the step_t's. + * + * Note that paths that are not indexed as reference paths (i.e. those that are + * hidden from for_each_path_handle by the backing graph and not listed in + * extra_path_names on construction) *will not be accessible through the + * overlay*! You can ask if they exist, but trying to get handles to steps on + * them will not work. To actually look at them you will need to go back to the + * base graph. + * + * TODO: Make the overlay transparent so that paths that don't get indexed in + * the overlay remain accessible but without the (fast versions of?) the + * position queries. */ class ReferencePathOverlay : public PathPositionHandleGraph { public: - ReferencePathOverlay(const PathHandleGraph* graph); + /// Create a ReferencePathOverlay indexing all non-hidden paths in the + /// backing graph (which show up in for_each_path_handle()). For path names + /// in extra_path_names, look them up and index them too, even if they are + /// hidden. + ReferencePathOverlay(const PathHandleGraph* graph, const std::unordered_set& extra_path_names = {}); ReferencePathOverlay() = default; ~ReferencePathOverlay() = default; diff --git a/bdsg/src/reference_path_overlay.cpp b/bdsg/src/reference_path_overlay.cpp index 8c69da41..d6692c13 100644 --- a/bdsg/src/reference_path_overlay.cpp +++ b/bdsg/src/reference_path_overlay.cpp @@ -6,25 +6,56 @@ #include #include +#include namespace bdsg { using namespace std; using namespace handlegraph; -ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph) : graph(graph) { +ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph, const std::unordered_set& extra_path_names) : graph(graph) { - // init the base hash table and gather path handles - uint64_t max_path_handle = 0; - std::vector path_handles; + // Get step counts for all paths we want to process, once. + std::unordered_map cached_step_counts; graph->for_each_path_handle([&](const path_handle_t& path) { + // Find and measure all the non-hidden paths. + // TODO: If we made the overlay transparent so we could access paths + // that didn't get indexed, we wouldn't be weirdly indexing haplotype + // paths from backends that don't hide them in the "reference" path + // overlay. + cached_step_counts[path] = graph->get_step_count(path); + }); + for (auto& path_name : extra_path_names) { + // Also index hidden paths that the user is asking for by name. + if (graph->has_path(path_name)) { + // The graph actually has this path. + path_handle_t path = graph->get_path_handle(path_name); + auto found = cached_step_counts.find(path); + if (found == cached_step_counts.end()) { + // And it's not already reference sense. + // Count steps and remember it + cached_step_counts.emplace_hint(found, path, graph->get_step_count(path)); + } + } + } + + // Now use the cache as a source of truth and make a vector of the paths. + std::vector path_handles; + // We also track the numerically max path handle + uint64_t max_path_handle = 0; + for (auto& handle_and_length : cached_step_counts) { + const path_handle_t& path = handle_and_length.first; path_handles.push_back(path); + + // Each of the paths needs a PathRecord reference_paths.insert(pair(path, PathRecord())); + // And needs to be maxed into the max handles. max_path_handle = std::max(max_path_handle, handlegraph::as_integer(path)); - }); + } + // sort in descending order by length to limit parallel scheduling makespan std::sort(path_handles.begin(), path_handles.end(), [&](path_handle_t a, path_handle_t b) { - return graph->get_step_count(a) > graph->get_step_count(b); + return cached_step_counts.at(a) > cached_step_counts.at(b); }); std::vector> num_steps(graph->max_node_id() + 1); @@ -35,7 +66,7 @@ ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph) : graph auto& path_record = reference_paths.at(path); // init the step vectors - size_t path_size = graph->get_step_count(path); + size_t path_size = cached_step_counts.at(path); path_record.steps.resize(path_size); // record the steps and the path length