Skip to content

Commit

Permalink
Refactor to remove a vector
Browse files Browse the repository at this point in the history
  • Loading branch information
m09526 committed Nov 14, 2024
1 parent 81cca48 commit 2f03492
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 24 deletions.
4 changes: 2 additions & 2 deletions cpp/include/cmdline/lub.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once
#include <cudf/table/table_view.hpp>
#include <cudf/io/types.hpp>

#include <cstddef>
#include <vector>

::size_t findLeastUpperBound(std::vector<cudf::table_view> const &views, ::size_t const colNo = 0);
::size_t findLeastUpperBound(std::vector<cudf::io::table_with_metadata> const &tables, ::size_t const colNo = 0);
3 changes: 2 additions & 1 deletion cpp/include/cmdline/slice.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <cudf/io/types.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>

Expand All @@ -9,4 +10,4 @@
int convertInteger(cudf::scalar const &scalar);

std::pair<std::vector<cudf::table_view>, std::vector<cudf::table_view>> splitAtNeedle(cudf::table_view const &needle,
std::vector<cudf::table_view> const &haystacks);
std::vector<cudf::io::table_with_metadata> const &haystacks);
15 changes: 5 additions & 10 deletions cpp/src/cmdline/chunk_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ ::size_t calcRowsWritten(auto const &readers) noexcept {
}

[[nodiscard]] cudf::io::table_metadata grabMetaData(std::string const &file) {
auto opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info(file)).build();
auto opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info(file)).num_rows(1).build();
return cudf::io::read_parquet(opts).metadata;
}

Expand Down Expand Up @@ -141,21 +141,16 @@ int main(int argc, char **argv) {

// Merge and write tables
if (lastTotalRowCount > 0) {
std::vector<cudf::table_view> views;
views.reserve(tables.size());
for (auto const &table : tables) { views.push_back(*table.tbl); }

// Find the least upper bound in sort column across these tables
auto const leastUpperBound = findLeastUpperBound(views, 0);
auto const leastUpperBound = findLeastUpperBound(tables, 0);

// Now take search "needle" from last row from of table with LUB
auto const lubTable = views[leastUpperBound].select({ 0 });
auto const lubTable = tables[leastUpperBound].tbl->select({ 0 });
auto const needle = cudf::split(lubTable, { lubTable.num_rows() - 1 })[1];
auto const tableVectors = splitAtNeedle(needle, views);
auto const tableVectors = splitAtNeedle(needle, tables);

SPDLOG_INFO("Merging {:d} rows", lastTotalRowCount);
auto merged = cudf::merge(views, { 0 }, { cudf::order::ASCENDING });
views.clear();
auto merged = cudf::merge(tableVectors.first, { 0 }, { cudf::order::ASCENDING });
tables.clear();
writer.write(*merged);
auto const elapsedTime = std::chrono::duration_cast<std::chrono::seconds>(timestamp() - startTime);
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/cmdline/lub.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,18 @@
#include <type_traits>
#include <utility>

::size_t findLeastUpperBound(std::vector<cudf::table_view> const &views, ::size_t const colNo) {
::size_t findLeastUpperBound(std::vector<cudf::io::table_with_metadata> const &tables, ::size_t const colNo) {

auto action = [&views, &colNo]<typename T>() {
auto action = [&tables, &colNo]<typename T>() {
using CudfScalarType = cudf::scalar_type_t<T>;
::size_t lubTableIndex = 0;
std::unique_ptr<cudf::scalar> currentLub =
cudf::get_element(views.front().column(colNo), views.front().column(colNo).size() - 1);
cudf::get_element(tables.front().tbl->get_column(colNo), tables.front().tbl->get_column(colNo).size() - 1);
// Loop over each table view, grab the last element in the sort column and find the lowest
for (::size_t idx = 0; cudf::table_view const &view : views) {
for (::size_t idx = 0; cudf::io::table_with_metadata const &table : tables) {

std::unique_ptr<cudf::scalar> lastElement =
cudf::get_element(view.column(colNo), view.column(colNo).size() - 1);
cudf::get_element(table.tbl->view().column(colNo), table.tbl->view().column(colNo).size() - 1);
auto const lub_ptr = static_cast<CudfScalarType *>(currentLub.get());
auto const lastElement_ptr = static_cast<CudfScalarType *>(lastElement.get());

Expand Down Expand Up @@ -58,6 +58,6 @@ ::size_t findLeastUpperBound(std::vector<cudf::table_view> const &views, ::size_
return lubTableIndex;
};

CUDF_EXPECTS(!views.empty(), "vector of tables cannot be empty");
return cudf::type_dispatcher(views.front().column(colNo).type(), action);
CUDF_EXPECTS(!tables.empty(), "vector of tables cannot be empty");
return cudf::type_dispatcher(tables.front().tbl->get_column(colNo).type(), action);
}
8 changes: 4 additions & 4 deletions cpp/src/cmdline/slice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,23 @@ int convertInteger(cudf::scalar const &scalar) {
}

std::pair<std::vector<cudf::table_view>, std::vector<cudf::table_view>> splitAtNeedle(cudf::table_view const &needle,
std::vector<cudf::table_view> const &haystacks) {
std::vector<cudf::io::table_with_metadata> const &haystacks) {
std::vector<cudf::table_view> tablesToMerge;
std::vector<cudf::table_view> remainingFragments;
tablesToMerge.reserve(haystacks.size());
remainingFragments.reserve(haystacks.size());

// Split each table at the point of that needle
for (::size_t idx = 0; auto const &view : haystacks) {
for (::size_t idx = 0; auto const &table : haystacks) {
// Find needle in each table view, table is "haystack"
std::unique_ptr<cudf::column> splitPoint =
cudf::upper_bound(view.select({ 0 }), needle, { cudf::order::ASCENDING }, { cudf::null_order::AFTER });
cudf::upper_bound(table.tbl->select({ 0 }), needle, { cudf::order::ASCENDING }, { cudf::null_order::AFTER });
CUDF_EXPECTS(splitPoint->size() == 1, "Split result should be single row");
// Get this index back to host
std::unique_ptr<cudf::scalar> splitIndex = cudf::get_element(*splitPoint, 0);
int const splitPos = convertInteger(*splitIndex);
// Now split this table at that index
std::vector<cudf::table_view> splitTables = cudf::split(view, { splitPos });
std::vector<cudf::table_view> splitTables = cudf::split(*table.tbl, { splitPos });
CUDF_EXPECTS(splitTables.size() == 2, "Should be two tables from split");
SPDLOG_INFO(
"File {:d} Table size after split {:d} and {:d}", idx, splitTables[0].num_rows(), splitTables[1].num_rows());
Expand Down

0 comments on commit 2f03492

Please sign in to comment.