From fd5c186d04c9e00c088b4b3c98f687b82e3e6e15 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 25 Oct 2024 17:47:04 -0400 Subject: [PATCH 1/3] Allow hinting paths to reference-index --- bdsg/include/bdsg/overlays/overlay_helper.hpp | 11 +++-- .../bdsg/overlays/reference_path_overlay.hpp | 5 ++- bdsg/src/reference_path_overlay.cpp | 41 +++++++++++++++---- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/bdsg/include/bdsg/overlays/overlay_helper.hpp b/bdsg/include/bdsg/overlays/overlay_helper.hpp index 8485bfa5..caeec671 100644 --- a/bdsg/include/bdsg/overlays/overlay_helper.hpp +++ b/bdsg/include/bdsg/overlays/overlay_helper.hpp @@ -66,10 +66,11 @@ template class OverlayHelper { public: // Handle non-const base graph - T* apply(V* input_graph) { + template + T* apply(V* input_graph, Params&&... params) { auto mutable_overlaid = dynamic_cast(input_graph); if (mutable_overlaid == nullptr) { - overlay = make_unique(input_graph); + overlay = make_unique(input_graph, std::forward(params)...); mutable_overlaid = dynamic_cast(overlay.get()); assert(mutable_overlaid != nullptr); } @@ -78,10 +79,11 @@ class OverlayHelper { } // Handle const base graph - const T* apply(const V* input_graph) { + template + const T* apply(const V* input_graph, Params&&... params) { overlaid = dynamic_cast(input_graph); if (overlaid == nullptr) { - overlay = make_unique(input_graph); + overlay = make_unique(input_graph, std::forward(params)...); overlaid = dynamic_cast(overlay.get()); assert(overlaid != nullptr); } @@ -100,6 +102,7 @@ class OverlayHelper { /// Implementation of overlay helper functionality for when multiple overlays need to be stacked. // There must be a way to generalize with variadic templates // (I had trouble chaining the output of the nested overlays together and getting the types right when trying) +// TODO: Add support for passing overlay constructor arguments through. template class PairOverlayHelper { public: diff --git a/bdsg/include/bdsg/overlays/reference_path_overlay.hpp b/bdsg/include/bdsg/overlays/reference_path_overlay.hpp index 1164e59c..a073c616 100644 --- a/bdsg/include/bdsg/overlays/reference_path_overlay.hpp +++ b/bdsg/include/bdsg/overlays/reference_path_overlay.hpp @@ -9,6 +9,7 @@ #define BDSG_REFERENCE_PATH_OVERLAY_HPP_INCLUDED #include +#include #include #include @@ -30,7 +31,9 @@ class ReferencePathOverlay : public PathPositionHandleGraph { public: - ReferencePathOverlay(const PathHandleGraph* graph); + /// Create a ReferencePathOverlay. For paths with names in + /// extra_path_names, index them as if they were reference paths. + ReferencePathOverlay(const PathHandleGraph* graph, const std::unordered_set& extra_path_names = {}); ReferencePathOverlay() = default; ~ReferencePathOverlay() = default; diff --git a/bdsg/src/reference_path_overlay.cpp b/bdsg/src/reference_path_overlay.cpp index 8c69da41..f4ba79fd 100644 --- a/bdsg/src/reference_path_overlay.cpp +++ b/bdsg/src/reference_path_overlay.cpp @@ -6,25 +6,52 @@ #include #include +#include namespace bdsg { using namespace std; using namespace handlegraph; -ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph) : graph(graph) { +ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph, const std::unordered_set& extra_path_names) : graph(graph) { - // init the base hash table and gather path handles - uint64_t max_path_handle = 0; + // Get step counts for all paths we want to process, once. + std::unordered_map cached_step_counts; + graph->for_each_path_matching({PathSense::REFERENCE, PathSense::GENERIC}, {}, {}, [&](const path_handle_t& path) { + // Find and measure all the reference and generic paths. + // TODO: Kick out generic paths? + cached_step_counts[path] = graph->get_step_count(path); + }); + for (auto& path_name : extra_path_names) { + if (graph->has_path(path_name)) { + // The graph actually has this path. + path_handle_t path = graph->get_path_handle(path_name); + auto found = cached_step_counts.find(path); + if (found == cached_step_counts.end()) { + // And it's not already reference sense. + // Count steps and remember it + cached_step_counts.emplace_hint(found, path, graph->get_step_count(path)); + } + } + } + + // Now use the cache as a source of truth and make a vector of the paths. std::vector path_handles; - graph->for_each_path_handle([&](const path_handle_t& path) { + // We also track the numerically max path handle + uint64_t max_path_handle = 0; + for (auto& handle_and_length : cached_step_counts) { + const path_handle_t& path = handle_and_length.first; path_handles.push_back(path); + + // Each of the paths needs a PathRecord reference_paths.insert(pair(path, PathRecord())); + // And needs to be maxed into the max handles. max_path_handle = std::max(max_path_handle, handlegraph::as_integer(path)); - }); + } + // sort in descending order by length to limit parallel scheduling makespan std::sort(path_handles.begin(), path_handles.end(), [&](path_handle_t a, path_handle_t b) { - return graph->get_step_count(a) > graph->get_step_count(b); + return cached_step_counts.at(a) > cached_step_counts.at(b); }); std::vector> num_steps(graph->max_node_id() + 1); @@ -35,7 +62,7 @@ ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph) : graph auto& path_record = reference_paths.at(path); // init the step vectors - size_t path_size = graph->get_step_count(path); + size_t path_size = cached_step_counts.at(path); path_record.steps.resize(path_size); // record the steps and the path length From 88181dbc80835e93de681e84e245c891e250f77c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 10:59:58 -0400 Subject: [PATCH 2/3] Go back to indexing all visible paths as of they were reference, and complain about non-transparent overlays --- .../overlays/packed_path_position_overlay.hpp | 5 ++++- .../bdsg/overlays/reference_path_overlay.hpp | 17 +++++++++++++++-- bdsg/src/reference_path_overlay.cpp | 10 +++++++--- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp b/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp index 275f1c45..e72fc12c 100644 --- a/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp +++ b/bdsg/include/bdsg/overlays/packed_path_position_overlay.hpp @@ -25,7 +25,10 @@ using namespace handlegraph; /* * An overlay that adds the PathPositionHandleGraph interface to a static PathHandleGraph - * by augmenting it with compressed index data structures + * by augmenting it with compressed index data structures. + * + * TODO: Make the overlay transparent so that paths hidden in the base graph + * remain accessible through the path metadata queries. */ class PackedPositionOverlay : public PathPositionHandleGraph, public ExpandingOverlayGraph { diff --git a/bdsg/include/bdsg/overlays/reference_path_overlay.hpp b/bdsg/include/bdsg/overlays/reference_path_overlay.hpp index a073c616..d9547d03 100644 --- a/bdsg/include/bdsg/overlays/reference_path_overlay.hpp +++ b/bdsg/include/bdsg/overlays/reference_path_overlay.hpp @@ -26,13 +26,26 @@ using namespace handlegraph; * An overlay that adds fast access to paths in addition to allowing path * position queries on them. The original graph's handle_t's and path_handle_t's * remain valid for the overlay, but not the step_t's. + * + * Note that paths that are not indexed as reference paths (i.e. those that are + * hidden from for_each_path_handle by the backing graph and not listed in + * extra_path_names on construction) *will not be accessible through the + * overlay*! You can ask if they exist, but trying to get handles to steps on + * them will not work. To actually look at them you will need to go back to the + * base graph. + * + * TODO: Make the overlay transparent so that paths that don't get indexed in + * the overlay remain accessible but without the (fast versions of?) the + * position queries. */ class ReferencePathOverlay : public PathPositionHandleGraph { public: - /// Create a ReferencePathOverlay. For paths with names in - /// extra_path_names, index them as if they were reference paths. + /// Create a ReferencePathOverlay indexing all non-hidden paths in the + /// backing graph (which show up in for_each_path_handle()). For path names + /// in extra_path_names, look them up and index them too, even if they are + /// hidden. ReferencePathOverlay(const PathHandleGraph* graph, const std::unordered_set& extra_path_names = {}); ReferencePathOverlay() = default; ~ReferencePathOverlay() = default; diff --git a/bdsg/src/reference_path_overlay.cpp b/bdsg/src/reference_path_overlay.cpp index f4ba79fd..d6692c13 100644 --- a/bdsg/src/reference_path_overlay.cpp +++ b/bdsg/src/reference_path_overlay.cpp @@ -17,12 +17,16 @@ ReferencePathOverlay::ReferencePathOverlay(const PathHandleGraph* graph, const s // Get step counts for all paths we want to process, once. std::unordered_map cached_step_counts; - graph->for_each_path_matching({PathSense::REFERENCE, PathSense::GENERIC}, {}, {}, [&](const path_handle_t& path) { - // Find and measure all the reference and generic paths. - // TODO: Kick out generic paths? + graph->for_each_path_handle([&](const path_handle_t& path) { + // Find and measure all the non-hidden paths. + // TODO: If we made the overlay transparent so we could access paths + // that didn't get indexed, we wouldn't be weirdly indexing haplotype + // paths from backends that don't hide them in the "reference" path + // overlay. cached_step_counts[path] = graph->get_step_count(path); }); for (auto& path_name : extra_path_names) { + // Also index hidden paths that the user is asking for by name. if (graph->has_path(path_name)) { // The graph actually has this path. path_handle_t path = graph->get_path_handle(path_name); From e98cda26c2522bee80292705494dca8dcffa1c5c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:38:26 -0400 Subject: [PATCH 3/3] Commit forgotten comment --- .../bdsg/overlays/packed_reference_path_overlay.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp b/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp index 49a00148..1f805ae0 100644 --- a/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp +++ b/bdsg/include/bdsg/overlays/packed_reference_path_overlay.hpp @@ -18,6 +18,14 @@ using namespace handlegraph; /* * An overlay that adds fast access to paths in addition to allowing path * position queries on them. + * + * TODO: Won't work properly with paths hidden from for_each_path_handle on the + * backing graph, since they won't be indexed but we also won't pass any kind + * of queries through to the backign graph for queries we expect to be able to + * fulfil from the index. Unkike in PackedPositionOverlay, we now expect the + * index to have some path data in it, not just offset tables that we wouldn't + * expect to use for hidden (i,e, haplotype) paths. We should make the overlay + * transparent so hidden paths work properly, or remove hidden paths. */ class PackedReferencePathOverlay : public PackedPositionOverlay {