Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25,526 changes: 25,526 additions & 0 deletions libtiledbvcf/external/nlohmann/json.hpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions libtiledbvcf/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ set(TILEDB_VCF_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/utils/normalize.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/sample_utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/variant_utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/vcf/bed_file.cc
${CMAKE_CURRENT_SOURCE_DIR}/vcf/region.cc
${CMAKE_CURRENT_SOURCE_DIR}/vcf/vcf_merger.cc
Expand Down
102 changes: 102 additions & 0 deletions libtiledbvcf/src/cli/tiledbvcf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,34 @@ void do_store(const IngestionParams& args, const CLI::App& cmd) {
LOG_TRACE("Finished store command.");
}

/**
 * Store/ingest a single variant.
 *
 * Checks that a JSON description was supplied, optionally raises the log
 * level to debug, forwards every parameter to a Writer and asks it to add the
 * variant. When TileDB stats were requested, the statistics reported by the
 * writer are printed to stdout afterwards.
 *
 * @param args Parsed options of the `add-variant` subcommand
 * @param cmd The subcommand itself; handed to config_to_log
 * @throws CLI::CallForHelp when no variant JSON was provided
 */
void do_variant(const IngestionParams& args, const CLI::App& cmd) {
  if (args.variant_json.empty()) {
    std::cerr << "ERROR: RequiredError: --json is required\n";
    throw CLI::CallForHelp();
  }

  if (args.verbose) {
    LOG_SET_LEVEL("debug");
  }

  LOG_TRACE("Starting add-variant command.");
  config_to_log(cmd);

  Writer writer;
  writer.set_all_params(args);
  writer.add_variant();

  if (args.tiledb_stats_enabled) {
    // Start from nullptr: if the writer leaves the out-parameter untouched we
    // must not stream an indeterminate pointer (undefined behavior).
    // NOTE(review): ownership of the returned buffer is not visible from
    // here — confirm whether it has to be released by the caller.
    char* stats = nullptr;
    writer.tiledb_stats(&stats);
    std::cout << "TileDB Internal Statistics:" << std::endl;
    if (stats != nullptr) {
      std::cout << stats << std::endl;
    }
  }
  LOG_TRACE("Finished add-variant command.");
}

/** Delete. */
void do_delete(const DeleteParams& args, const CLI::App& cmd) {
LOG_TRACE("Starting delete command.");
Expand Down Expand Up @@ -680,6 +708,79 @@ void add_store(CLI::App& app) {
cmd->callback([args, cmd]() { do_store(*args, *cmd); });
}

/**
 * Registers the `add-variant` subcommand on the given application.
 *
 * The parsed values are stored in a shared IngestionParams instance that the
 * subcommand callback passes on to do_variant. Options are grouped as
 * "Variant options", "TileDB options", "Contig options" and "Debug options";
 * the temporary directory defaults to /dev/shm.
 *
 * @param app Application the subcommand is attached to
 */
void add_variant(CLI::App& app) {
  auto params = std::make_shared<IngestionParams>();
  IngestionParams& opts = *params;

  auto sub = app.add_subcommand(
      "add-variant",
      "Adds a variant to an existing sample in a TileDB-VCF dataset");

  sub->set_help_flag("-h,--help")->group("");  // hide from help message
  add_tiledb_uri_option(sub, opts.uri);

  // Variant description and scratch space
  sub->option_defaults()->group("Variant options");
  auto json_opt = sub->add_option(
      "-j,--json", opts.variant_json, "JSON describing the variant to add");
  json_opt->required();

  // The default must be in place before the option is registered
  opts.variant_tmp_dir = "/dev/shm";
  auto tmp_dir_opt = sub->add_option(
      "-d,--tmp-dir",
      opts.variant_tmp_dir,
      "Directory used for temporary file storage");
  tmp_dir_opt->default_str(opts.variant_tmp_dir);

  // TileDB configuration and statistics
  sub->option_defaults()->group("TileDB options");
  add_tiledb_options(sub, opts.tiledb_config);
  sub->add_flag("--stats", opts.tiledb_stats_enabled, "Enable TileDB stats");
  sub->add_flag(
      "--stats-vcf-header-array",
      opts.tiledb_stats_enabled_vcf_header_array,
      "Enable TileDB stats for vcf header array usage");

  // Contig handling
  sub->option_defaults()->group("Contig options");
  sub->add_flag(
      "!--disable-contig-fragment-merging",
      opts.contig_fragment_merging,
      "Disable merging of contigs into fragments. Generally contig fragment "
      "merging is good, this is a performance optimization to reduce the "
      "prefixes on a s3/azure/gcs bucket when there is a large number of "
      "pseudo contigs which are small in size.");

  auto keep_separate_opt = sub->add_option(
      "--contigs-to-keep-separate",
      opts.contigs_to_keep_separate,
      "Comma-separated list of contigs that should not be merged "
      "into combined fragments. The default list includes all "
      "standard human chromosomes in both UCSC (e.g., chr1) and "
      "Ensembl (e.g., 1) formats.");
  keep_separate_opt->delimiter(',');
  keep_separate_opt->default_str("");
  keep_separate_opt->excludes("--disable-contig-fragment-merging");

  auto allow_merging_opt = sub->add_option(
      "--contigs-to-allow-merging",
      opts.contigs_to_allow_merging,
      "Comma-separated list of contigs that should be allowed to "
      "be merged into combined fragments.");
  allow_merging_opt->delimiter(',');
  allow_merging_opt->excludes("--disable-contig-fragment-merging");
  allow_merging_opt->excludes("--contigs-to-keep-separate");

  auto contig_mode_opt = sub->add_option(
      "--contig-mode",
      opts.contig_mode,
      "Select which contigs are ingested: 'separate', 'merged', or 'all' "
      "contigs");
  contig_mode_opt->transform(CLI::CheckedTransformer(contig_mode_map));

  // Logging / diagnostics
  sub->option_defaults()->group("Debug options");
  add_logging_options(sub, opts.log_level, opts.log_file);
  sub->add_flag("-v,--verbose", opts.verbose, "Enable verbose output");
  CLI::deprecate_option(sub, "--verbose", "--log-level debug");

  // register function to implement this command; the callback shares
  // ownership of the parameters so they stay alive until it runs
  sub->callback([params, sub]() { do_variant(*params, *sub); });
}

void add_delete(CLI::App& app) {
auto args = std::make_shared<DeleteParams>();
auto cmd =
Expand Down Expand Up @@ -964,6 +1065,7 @@ int main(int argc, char** argv) {
add_create(app);
add_register(app);
add_store(app);
add_variant(app);
add_delete(app);
add_export(app);
add_list(app);
Expand Down
6 changes: 6 additions & 0 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2066,6 +2066,12 @@ const char* TileDBVCFDataset::sample_name(const int32_t index) const {
return this->sample_names_[index].data();
}

/**
 * Returns the id stored for `sample_name` in the dataset metadata.
 *
 * For a V4 dataset whose sample names have not been loaded yet, they are
 * loaded before the lookup. The lookup itself goes through `.at()`, so an
 * unknown name is reported by that call rather than returning a default.
 */
uint32_t TileDBVCFDataset::sample_id(const std::string& sample_name) {
  const bool needs_load =
      metadata_.version == Version::V4 && !sample_names_loaded_;
  if (needs_load) {
    load_sample_names_v4();
  }
  return metadata_.sample_ids.at(sample_name);
}

std::string TileDBVCFDataset::data_array_uri(
const std::string& root_uri, bool relative, bool legacy) {
char delimiter = legacy ? '-' : '/';
Expand Down
8 changes: 8 additions & 0 deletions libtiledbvcf/src/dataset/tiledbvcfdataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,14 @@ class TileDBVCFDataset {
*/
const char* sample_name(int32_t index) const;

/**
 * @brief Get sample id (row index) by name
 *
 * For V4 datasets the sample names are loaded first if that has not
 * happened yet.
 *
 * NOTE(review): the lookup is done with `.at()` on the metadata's sample id
 * container; presumably an unknown name raises std::out_of_range — confirm
 * against the container type.
 *
 * @param sample_name The name of the sample to get the id for
 * @return uint32_t The id for the given sample name
 */
uint32_t sample_id(const std::string& sample_name);

/**
* Writes the given sample header data to the separate sample header array in
* the dataset.
Expand Down
16 changes: 16 additions & 0 deletions libtiledbvcf/src/utils/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@
#if !defined _MSC_VER
#include <unistd.h>
#endif
#include <algorithm>
#include <cerrno>
#include <filesystem>
#include <fstream>
#include <iterator>
#include <mutex>
#include <numeric>
#include <random>
#include <sstream>

#include "htslib_plugin/hfile_tiledb_vfs.h"
#include "utils/logger_public.h"
Expand Down Expand Up @@ -65,6 +69,18 @@ std::vector<std::string> split(
return output;
}

/**
 * Concatenates `strings`, placing `delim` between consecutive elements.
 *
 * An empty vector yields an empty string and a single element is returned
 * unchanged (no delimiter is added before the first or after the last
 * element).
 *
 * @param strings The strings to join
 * @param delim The delimiter inserted between elements
 * @return The joined string
 */
std::string join(
    std::vector<std::string> const &strings, const std::string& delim) {
  if (strings.empty()) {
    return std::string();
  }

  // Compute the final length up front so the result is allocated once; the
  // previous accumulate-based version copied the growing string on every step.
  std::string::size_type total = delim.size() * (strings.size() - 1);
  for (const auto& piece : strings) {
    total += piece.size();
  }

  std::string joined;
  joined.reserve(total);
  joined += strings.front();
  for (auto it = strings.begin() + 1; it != strings.end(); ++it) {
    joined += delim;
    joined += *it;
  }
  return joined;
}

void enable_pretty_print_numbers(std::ostream& os) {
struct custom_punct : std::numpunct<char> {
char do_thousands_sep() const {
Expand Down
10 changes: 10 additions & 0 deletions libtiledbvcf/src/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,16 @@ std::set<std::string> split_set(const std::string& s, char delim);
*/
std::vector<std::string> split(const std::string& s, char delim);

/**
 * @brief
 * Joins a vector into a string given some string delimiter.
 * An empty vector produces an empty string.
 * @param strings The vector of strings to join
 * @param delim The string to use as the delimiter
 * @return The string resulting from the join
 */
std::string join(
    std::vector<std::string> const &strings, const std::string& delim);

/**
* @tparam T
* @param start_time
Expand Down
126 changes: 126 additions & 0 deletions libtiledbvcf/src/utils/variant_utils.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/**
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2025 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

#include "utils/utils.h"
#include "utils/variant_utils.h"

namespace tiledb {
namespace vcf {

/**
 * Renders a JSON value as plain text.
 *
 * String values are returned as their raw contents (no surrounding quotes);
 * every other value is serialized with dump().
 *
 * @param j The JSON value to render
 * @return The textual form of `j`
 */
inline std::string dump_string(const nlohmann::json& j) {
  return j.is_string() ? j.get<std::string>() : j.dump();
}

/**
 * Builds a variant from its JSON description.
 *
 * The document must provide the keys "sample", "chrom", "pos", "id", "ref",
 * "alt", "qual", "filter", "info" and "format"; each is looked up with at().
 * No exception is caught here, so anything raised by the JSON library while
 * parsing or converting propagates to the caller.
 *
 * @param json JSON text describing the variant
 */
Variant::Variant(const std::string& json) {
  const nlohmann::json root = nlohmann::json::parse(json);

  // Values copied straight into the corresponding members
  root.at("sample").get_to(sample_name);
  root.at("chrom").get_to(chrom);
  root.at("pos").get_to(pos);
  root.at("id").get_to(id);
  root.at("ref").get_to(ref);
  root.at("alt").get_to(alt);
  root.at("qual").get_to(qual);

  // Values that are flattened into delimiter-separated strings
  set_filter(root.at("filter"));
  set_info(root.at("info"));
  set_format_and_sample(root.at("format"));
}

/** Nothing to release explicitly; members clean up after themselves. */
Variant::~Variant() = default;

std::string Variant::to_vcf_line() {
std::string pos_str = std::to_string(pos);
std::string qual_str = std::to_string(qual);
std::vector<std::string> columns = {
chrom, pos_str, id, ref, alt, qual_str, filter, info, format, sample
};
std::string delim = "\t";
std::string line = utils::join(columns, delim);

return line;
}

void Variant::to_record(const bcf_hdr_t* hdr, bcf1_t* rec) {
// Convert the variant to a BCF kstring
std::string line_str = to_vcf_line();
size_t length = line_str.length() + 1;
size_t size = sizeof(char) * length;
char* line_cstr = new char[length];
std::strcpy(line_cstr, line_str.c_str());
kstring_t line_kstr = {length, size, line_cstr};

// Convert the string to a BCF record
if (vcf_parse(&line_kstr, hdr, rec) != 0) {
std::string message = fmt::format(
"Failed to convert VCF line to BCF record: \"{}\"",
line_str);
LOG_ERROR(message);
}

// NOTE: vcf_parse cleans up line_cstr
}

/**
 * Stores the filter column as a ';'-separated list.
 *
 * NOTE(review): an empty array results in an empty string rather than a
 * placeholder — confirm that is what downstream parsing expects.
 *
 * @param j JSON array of filter names (converted to a vector of strings)
 */
void Variant::set_filter(const nlohmann::json& j) {
  const auto names = j.get<std::vector<std::string>>();
  filter = utils::join(names, ";");
}

/**
 * Stores the info column as ';'-separated `key=value` pairs.
 *
 * Each value is rendered with dump_string, i.e. strings contribute their raw
 * contents and any other JSON value its dump() form.
 * NOTE(review): non-scalar values (e.g. arrays) would therefore appear in
 * their JSON notation — confirm callers only pass scalars or pre-formatted
 * strings.
 *
 * @param j JSON value whose items() supply the keys and values
 */
void Variant::set_info(const nlohmann::json& j) {
  std::vector<std::string> pairs;
  for (const auto& item : j.items()) {
    pairs.emplace_back(item.key() + "=" + dump_string(item.value()));
  }
  info = utils::join(pairs, ";");
}

/**
 * Fills the format and sample columns from one JSON value.
 *
 * The keys, in items() iteration order, become the ':'-separated format
 * string; the matching values, rendered with dump_string, become the
 * ':'-separated sample string, so both columns stay aligned.
 *
 * @param j JSON value whose items() supply the format keys and sample values
 */
void Variant::set_format_and_sample(const nlohmann::json& j) {
  std::vector<std::string> keys;
  std::vector<std::string> values;
  for (const auto& item : j.items()) {
    keys.emplace_back(item.key());
    values.emplace_back(dump_string(item.value()));
  }
  format = utils::join(keys, ":");
  sample = utils::join(values, ":");
}

} // namespace vcf
} // namespace tiledb
Loading
Loading