From 394059317326a64e1bce43a37f8ef4d7edd46364 Mon Sep 17 00:00:00 2001 From: George Powley Date: Sun, 28 Apr 2024 09:57:00 -0400 Subject: [PATCH] Read BED files stored as TileDB files (#712) --- .../io/tiledb/libvcfnative/LibVCFNative.c | 19 ----- .../io/tiledb/libvcfnative/LibVCFNative.h | 9 --- .../io/tiledb/libvcfnative/LibVCFNative.java | 2 - .../io/tiledb/libvcfnative/VCFReader.java | 9 --- .../io/tiledb/libvcfnative/VCFReaderTest.java | 37 +++------ .../src/tiledbvcf/binding/libtiledbvcf.cc | 1 - apis/python/src/tiledbvcf/binding/reader.cc | 5 -- apis/python/src/tiledbvcf/binding/reader.h | 3 - apis/python/src/tiledbvcf/dataset.py | 18 ----- apis/python/tests/test_tiledbvcf.py | 75 +++++++++++++++++- libtiledbvcf/src/c_api/tiledbvcf.cc | 11 --- libtiledbvcf/src/c_api/tiledbvcf.h | 10 --- libtiledbvcf/src/cli/tiledbvcf.cc | 5 -- libtiledbvcf/src/read/reader.cc | 76 +++++++++---------- libtiledbvcf/src/read/reader.h | 4 - libtiledbvcf/src/utils/utils.cc | 11 +++ libtiledbvcf/src/utils/utils.h | 7 ++ libtiledbvcf/src/vcf/region.cc | 7 +- libtiledbvcf/test/src/unit-c-api-reader.cc | 28 ------- 19 files changed, 141 insertions(+), 196 deletions(-) diff --git a/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.c b/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.c index d85ba9a64..6dddc785e 100644 --- a/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.c +++ b/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.c @@ -154,25 +154,6 @@ Java_io_tiledb_libvcfnative_LibVCFNative_tiledb_1vcf_1reader_1set_1bed_1file( return rc; } -JNIEXPORT jint JNICALL -Java_io_tiledb_libvcfnative_LibVCFNative_tiledb_1vcf_1reader_1set_1bed_1array( - JNIEnv* env, jclass self, jlong readerPtr, jstring uri) { - (void)self; - tiledb_vcf_reader_t* reader = (tiledb_vcf_reader_t*)readerPtr; - if (reader == 0) { - return TILEDB_VCF_ERR; - } - - const char* c_uri = (*env)->GetStringUTFChars(env, uri, 0); - if (c_uri == NULL) { - return TILEDB_VCF_ERR; - } - - int rc = tiledb_vcf_reader_set_bed_array(reader, c_uri); - (*env)->ReleaseStringUTFChars(env, uri, c_uri); - return rc; -} - JNIEXPORT jint JNICALL Java_io_tiledb_libvcfnative_LibVCFNative_tiledb_1vcf_1reader_1set_1samples( JNIEnv* env, jclass self, jlong readerPtr, jstring samples) { diff --git a/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.h b/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.h index e192a7b29..070c75aa7 100644 --- a/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.h +++ b/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.h @@ -52,15 +52,6 @@ JNIEXPORT jint JNICALL Java_io_tiledb_libvcfnative_LibVCFNative_tiledb_1vcf_1reader_1set_1bed_1file( JNIEnv*, jclass, jlong, jstring); -/* - * Class: io_tiledb_libvcfnative_LibVCFNative - * Method: tiledb_vcf_reader_set_bed_array - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL -Java_io_tiledb_libvcfnative_LibVCFNative_tiledb_1vcf_1reader_1set_1bed_1array( - JNIEnv*, jclass, jlong, jstring); - /* * Class: io_tiledb_libvcfnative_LibVCFNative * Method: tiledb_vcf_reader_set_samples diff --git a/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.java b/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.java index 203996e50..420afec02 100644 --- a/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.java +++ b/apis/java/src/main/java/io/tiledb/libvcfnative/LibVCFNative.java @@ -38,8 +38,6 @@ public class LibVCFNative { public static final native int tiledb_vcf_reader_set_bed_file(long readerPtr, String uri); - public static final native int tiledb_vcf_reader_set_bed_array(long readerPtr, String uri); - public static final native int tiledb_vcf_reader_set_samples(long readerPtr, String samplesCSV); public static final native int tiledb_vcf_reader_set_regions(long readerPtr, String regionsCSV); diff --git a/apis/java/src/main/java/io/tiledb/libvcfnative/VCFReader.java b/apis/java/src/main/java/io/tiledb/libvcfnative/VCFReader.java index 019563bd3..4a9bfb0f8 100644 --- a/apis/java/src/main/java/io/tiledb/libvcfnative/VCFReader.java +++ b/apis/java/src/main/java/io/tiledb/libvcfnative/VCFReader.java @@ -286,15 +286,6 @@ public VCFReader setBedFile(String uri) { return this; } - public VCFReader setBedArray(String uri) { - int rc = LibVCFNative.tiledb_vcf_reader_set_bed_array(readerPtr, uri); - if (rc != 0) { - String msg = getLastErrorMessage(); - throw new RuntimeException("Error setting query bed array '" + uri + "': " + msg); - } - return this; - } - public VCFReader setSortRegions(boolean sortRegions) { int rc = LibVCFNative.tiledb_vcf_reader_set_sort_regions(readerPtr, sortRegions); if (rc != 0) { diff --git a/apis/java/src/test/java/io/tiledb/libvcfnative/VCFReaderTest.java b/apis/java/src/test/java/io/tiledb/libvcfnative/VCFReaderTest.java index 29b71f21a..55aa36653 100644 --- a/apis/java/src/test/java/io/tiledb/libvcfnative/VCFReaderTest.java +++ b/apis/java/src/test/java/io/tiledb/libvcfnative/VCFReaderTest.java @@ -72,8 +72,7 @@ private String[] getSamples() throws IOException { * @return The VCFReader instance * @throws IOException */ - private VCFReader getVFCReader( - Optional inputSamples, Optional bedFile, Optional bedArray) + private VCFReader getVFCReader(Optional inputSamples, Optional bedFile) throws IOException { String samples[] = inputSamples.orElse(getSamples()); @@ -83,8 +82,6 @@ private VCFReader getVFCReader( if (bedFile.isPresent()) reader.setBedFile(bedFile.get()); - if (bedArray.isPresent()) reader.setBedArray(bedArray.get()); - return reader; } @@ -95,7 +92,7 @@ private VCFReader getVFCReader( */ @Test public void testReadCompletes() throws IOException { - VCFReader reader = getVFCReader(Optional.empty(), Optional.empty(), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.empty()); int results = 0; @@ -114,7 +111,7 @@ public void testReadCompletes() throws IOException { */ @Test public void testReaderValues() throws IOException { - VCFReader reader = getVFCReader(Optional.empty(), Optional.empty(), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.empty()); reader.setRanges("1:12100-13360,1:13500-17350".split(",")); @@ -287,7 +284,7 @@ public void testNumResultsMultipleRangePartitions() throws IOException { int[] partitionsToCheck = {1, 2, 5, 10, 50, 100}; for (int numPartitions : partitionsToCheck) { - VCFReader reader = getVFCReader(Optional.empty(), Optional.empty(), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.empty()); int results = 0; @@ -313,9 +310,7 @@ public void testNumResultsMultipleRangePartitions() throws IOException { */ @Test public void testSingleSample() throws IOException { - VCFReader reader = - getVFCReader( - Optional.of(new String[] {getSamples()[0]}), Optional.empty(), Optional.empty()); + VCFReader reader = getVFCReader(Optional.of(new String[] {getSamples()[0]}), Optional.empty()); int results = 0; @@ -334,8 +329,7 @@ public void testSingleSample() throws IOException { */ @Test public void testBEDFile() throws IOException { - VCFReader reader = - getVFCReader(Optional.empty(), Optional.of(constructBEDURI()), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.of(constructBEDURI())); int results = 0; @@ -355,8 +349,7 @@ public void testBEDFile() throws IOException { */ @Test public void testBEDArray() throws IOException { - VCFReader reader = - getVFCReader(Optional.empty(), Optional.empty(), Optional.of(constructBEDArrayURI())); + VCFReader reader = getVFCReader(Optional.empty(), Optional.of(constructBEDArrayURI())); int results = 0; @@ -376,8 +369,7 @@ public void testBEDArray() throws IOException { */ @Test public void testSetSingleBuffer() throws IOException { - VCFReader reader = - getVFCReader(Optional.empty(), Optional.of(constructBEDURI()), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.of(constructBEDURI())); ByteBuffer data = ByteBuffer.allocateDirect(1024); reader.setBuffer("sample_name", data); @@ -394,16 +386,14 @@ public void testSetSingleBuffer() throws IOException { @Test public void testSetStatsEnabled() throws IOException { - VCFReader reader = - getVFCReader(Optional.empty(), Optional.of(constructBEDURI()), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.of(constructBEDURI())); reader.setStatsEnabled(true); } @Test public void testGetStatsEnabled() throws IOException { - VCFReader reader = - getVFCReader(Optional.empty(), Optional.of(constructBEDURI()), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.of(constructBEDURI())); Assert.assertFalse(reader.getStatsEnabled()); reader.setStatsEnabled(true); @@ -414,8 +404,7 @@ public void testGetStatsEnabled() throws IOException { @Test public void testStats() throws IOException { - VCFReader reader = - getVFCReader(Optional.empty(), Optional.of(constructBEDURI()), Optional.empty()); + VCFReader reader = getVFCReader(Optional.empty(), Optional.of(constructBEDURI())); reader.setStatsEnabled(true); Assert.assertNotNull(reader.stats()); } @@ -427,9 +416,7 @@ public void testStats() throws IOException { */ @Test public void testAttributes() throws IOException { - VCFReader reader = - getVFCReader( - Optional.of(new String[] {getSamples()[0]}), Optional.empty(), Optional.empty()); + VCFReader reader = getVFCReader(Optional.of(new String[] {getSamples()[0]}), Optional.empty()); Assert.assertTrue(reader.attributes.size() > 0); Assert.assertTrue(reader.fmtAttributes.size() > 0); diff --git a/apis/python/src/tiledbvcf/binding/libtiledbvcf.cc b/apis/python/src/tiledbvcf/binding/libtiledbvcf.cc index 90f38a7b9..3d152c8a2 100644 --- a/apis/python/src/tiledbvcf/binding/libtiledbvcf.cc +++ b/apis/python/src/tiledbvcf/binding/libtiledbvcf.cc @@ -24,7 +24,6 @@ PYBIND11_MODULE(libtiledbvcf, m) { .def("set_samples_file", &Reader::set_samples_file) .def("set_regions", &Reader::set_regions) .def("set_bed_file", &Reader::set_bed_file) - .def("set_bed_array", &Reader::set_bed_array) .def("set_sort_regions", &Reader::set_sort_regions) .def("set_region_partition", &Reader::set_region_partition) .def("set_sample_partition", &Reader::set_sample_partition) diff --git a/apis/python/src/tiledbvcf/binding/reader.cc b/apis/python/src/tiledbvcf/binding/reader.cc index ae8e24282..606e0c21d 100644 --- a/apis/python/src/tiledbvcf/binding/reader.cc +++ b/apis/python/src/tiledbvcf/binding/reader.cc @@ -108,11 +108,6 @@ void Reader::set_bed_file(const std::string& uri) { check_error(reader, tiledb_vcf_reader_set_bed_file(reader, uri.c_str())); } -void Reader::set_bed_array(const std::string& uri) { - auto reader = ptr.get(); - check_error(reader, tiledb_vcf_reader_set_bed_array(reader, uri.c_str())); -} - void Reader::set_region_partition(int32_t partition, int32_t num_partitions) { auto reader = ptr.get(); check_error( diff --git a/apis/python/src/tiledbvcf/binding/reader.h b/apis/python/src/tiledbvcf/binding/reader.h index ceb14b574..8424ceefb 100644 --- a/apis/python/src/tiledbvcf/binding/reader.h +++ b/apis/python/src/tiledbvcf/binding/reader.h @@ -67,9 +67,6 @@ class Reader { /** Sets a URI of a BED file containing regions to include in the read. */ void set_bed_file(const std::string& uri); - /** Sets a URI of a BED array containing regions to include in the read. */ - void set_bed_array(const std::string& uri); - /** Sets the region partition of this reader. */ void set_region_partition(int32_t partition, int32_t num_partitions); diff --git a/apis/python/src/tiledbvcf/dataset.py b/apis/python/src/tiledbvcf/dataset.py index 1f28641f4..7958bfc76 100644 --- a/apis/python/src/tiledbvcf/dataset.py +++ b/apis/python/src/tiledbvcf/dataset.py @@ -210,7 +210,6 @@ def read_arrow( regions: (str, List[str]) = None, samples_file: str = None, bed_file: str = None, - bed_array: str = None, skip_check_samples: bool = False, set_af_filter: str = "", scan_all_samples: bool = False, @@ -236,8 +235,6 @@ def read_arrow( URI of file containing sample names to be read, one per line. bed_file URI of a BED file of genomic regions to be read. - bed_array - URI of a BED array of genomic regions to be read. skip_check_samples Skip checking if the samples in `samples_file` exist in the dataset. set_af_filter @@ -276,9 +273,6 @@ def read_arrow( if bed_file is not None: self.reader.set_bed_file(bed_file) - if bed_array is not None: - self.reader.set_bed_array(bed_array) - return self.continue_read_arrow() def read_variant_stats( @@ -322,7 +316,6 @@ def read( regions: (str, List[str]) = None, samples_file: str = None, bed_file: str = None, - bed_array: str = None, skip_check_samples: bool = False, set_af_filter: str = "", scan_all_samples: bool = False, @@ -350,8 +343,6 @@ def read( URI of file containing sample names to be read, one per line. bed_file URI of a BED file of genomic regions to be read. - bed_array - URI of a BED array of genomic regions to be read. skip_check_samples Skip checking if the samples in `samples_file` exist in the dataset. set_af_filter @@ -388,9 +379,6 @@ def read( if bed_file is not None: self.reader.set_bed_file(bed_file) - if bed_array is not None: - self.reader.set_bed_array(bed_array) - return self.continue_read() def export( @@ -399,7 +387,6 @@ def export( regions: (str, List[str]) = None, samples_file: str = None, bed_file: str = None, - bed_array: str = None, skip_check_samples: bool = False, enable_progress_estimation: bool = False, merge: bool = False, @@ -420,8 +407,6 @@ def export( URI of file containing sample names to be read, one per line. bed_file URI of a BED file of genomic regions to be read. - bed_array - URI of a BED array of genomic regions to be read. skip_check_samples Skip checking if the samples in `samples_file` exist in the dataset. set_af_filter @@ -467,9 +452,6 @@ def export( if bed_file is not None: self.reader.set_bed_file(bed_file) - if bed_array is not None: - self.reader.set_bed_array(bed_array) - self.reader.read() if not self.read_completed(): raise Exception("Unexpected read status during export.") diff --git a/apis/python/tests/test_tiledbvcf.py b/apis/python/tests/test_tiledbvcf.py index d89487c72..0abcf77ac 100755 --- a/apis/python/tests/test_tiledbvcf.py +++ b/apis/python/tests/test_tiledbvcf.py @@ -1573,7 +1573,9 @@ def test_flag_export(tmp_path): assert df["info_DS"].tolist() == expected_ds -def test_bed_array(tmp_path, test_ds_v4): +def test_bed_filestore(tmp_path, test_ds_v4): + tiledbvcf.config_logging("debug") + expected_df = pd.DataFrame( { "sample_name": pd.Series( @@ -1583,6 +1585,73 @@ def test_bed_array(tmp_path, test_ds_v4): "HG00280", "HG01762", "HG00280", + ] + ), + "pos_start": pd.Series( + [ + 12141, + 12141, + 12546, + 12546, + 17319, + ], + dtype=np.int32, + ), + "pos_end": pd.Series( + [ + 12277, + 12277, + 12771, + 12771, + 17479, + ], + dtype=np.int32, + ), + } + ).sort_values(ignore_index=True, by=["sample_name", "pos_start"]) + + # Create BED file + bed_file = os.path.join(tmp_path, "test.bed") + + regions = [ + (1, 12000, 13000), + (1, 17000, 17479), + ] + + with open(bed_file, "w") as f: + for region in regions: + f.write(f"{region[0]}\t{region[1]}\t{region[2]}\n") + + # Create BED filestore from BED file + bed_filestore = os.path.join(tmp_path, "test.bed.filestore") + tiledb.Array.create(bed_filestore, tiledb.ArraySchema.from_file(bed_file)) + tiledb.Filestore.copy_from(bed_filestore, bed_file) + + # Create the dataset + for use_arrow in [False, True]: + func = test_ds_v4.read_arrow if use_arrow else test_ds_v4.read + + df = func(attrs=["sample_name", "pos_start", "pos_end"], bed_file=bed_filestore) + if use_arrow: + df = df.to_pandas() + + print(df) + + _check_dfs( + expected_df, + df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]), + ) + + +def test_bed_array(tmp_path, test_ds_v4): + expected_df = pd.DataFrame( + { + "sample_name": pd.Series( + [ + "HG00280", + "HG01762", + "HG00280", + "HG01762", "HG00280", ] ), @@ -1593,7 +1662,6 @@ def test_bed_array(tmp_path, test_ds_v4): 12546, 12546, 17319, - 17480, ], dtype=np.int32, ), @@ -1604,7 +1672,6 @@ def test_bed_array(tmp_path, test_ds_v4): 12771, 12771, 17479, - 17486, ], dtype=np.int32, ), @@ -1636,7 +1703,7 @@ def test_bed_array(tmp_path, test_ds_v4): for use_arrow in [False, True]: func = test_ds_v4.read_arrow if use_arrow else test_ds_v4.read - df = func(attrs=["sample_name", "pos_start", "pos_end"], bed_array=bed_array) + df = func(attrs=["sample_name", "pos_start", "pos_end"], bed_file=bed_array) if use_arrow: df = df.to_pandas() diff --git a/libtiledbvcf/src/c_api/tiledbvcf.cc b/libtiledbvcf/src/c_api/tiledbvcf.cc index e5c6ddbf0..6dc2dbc94 100644 --- a/libtiledbvcf/src/c_api/tiledbvcf.cc +++ b/libtiledbvcf/src/c_api/tiledbvcf.cc @@ -228,17 +228,6 @@ int32_t tiledb_vcf_reader_set_bed_file( return TILEDB_VCF_OK; } -int32_t tiledb_vcf_reader_set_bed_array( - tiledb_vcf_reader_t* reader, const char* uri) { - if (sanity_check(reader) == TILEDB_VCF_ERR || uri == nullptr) - return TILEDB_VCF_ERR; - - if (SAVE_ERROR_CATCH(reader, reader->reader_->set_bed_array(uri))) - return TILEDB_VCF_ERR; - - return TILEDB_VCF_OK; -} - int32_t tiledb_vcf_reader_set_samples( tiledb_vcf_reader_t* reader, const char* samples) { if (sanity_check(reader) == TILEDB_VCF_ERR || samples == nullptr) diff --git a/libtiledbvcf/src/c_api/tiledbvcf.h b/libtiledbvcf/src/c_api/tiledbvcf.h index 15e83b5c3..c99d6671c 100644 --- a/libtiledbvcf/src/c_api/tiledbvcf.h +++ b/libtiledbvcf/src/c_api/tiledbvcf.h @@ -182,16 +182,6 @@ TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_samples_file( TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_bed_file(tiledb_vcf_reader_t* reader, const char* uri); -/** - * Sets the URI of a BED array containing regions to be read. - * - * @param reader VCF reader object - * @param uri URI of BED array - * @return `TILEDB_VCF_OK` for success or `TILEDB_VCF_ERR` for error. - */ -TILEDBVCF_EXPORT int32_t -tiledb_vcf_reader_set_bed_array(tiledb_vcf_reader_t* reader, const char* uri); - /** * Given a CSV string of sample names, sets the samples to be read. * diff --git a/libtiledbvcf/src/cli/tiledbvcf.cc b/libtiledbvcf/src/cli/tiledbvcf.cc index 936e613bf..56e9b568f 100644 --- a/libtiledbvcf/src/cli/tiledbvcf.cc +++ b/libtiledbvcf/src/cli/tiledbvcf.cc @@ -762,11 +762,6 @@ void add_export(CLI::App& app) { args->regions_file_uri, "File containing regions (BED format)") ->excludes("--regions"); - cmd->add_option( - "-A,--bed-array", - args->bed_array_uri, - "URI of TileDB BED array containing regions to export") - ->excludes("--regions"); cmd->add_flag( "--sorted", args->sort_regions, diff --git a/libtiledbvcf/src/read/reader.cc b/libtiledbvcf/src/read/reader.cc index 3392d8536..2e8c27c0e 100644 --- a/libtiledbvcf/src/read/reader.cc +++ b/libtiledbvcf/src/read/reader.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "dataset/attribute_buffer_set.h" @@ -108,29 +109,9 @@ void Reader::set_samples_file(const std::string& uri) { } void Reader::set_bed_file(const std::string& uri) { - if (vfs_ == nullptr) - init_tiledb(); - - if (!vfs_->is_file(uri)) - throw std::runtime_error( - "Error setting BED file; '" + uri + "' does not exist."); params_.regions_file_uri = uri; } -void Reader::set_bed_array(const std::string& uri) { - if (vfs_ == nullptr) { - init_tiledb(); - } - - auto obj = Object::object(*ctx_, uri); - if (obj.type() != Object::Type::Array) { - throw std::runtime_error( - "Error setting BED array: '" + uri + "' is not a TileDB Array"); - } - params_.bed_array_uri = uri; - LOG_DEBUG("BED array URI set to {}", uri); -} - void Reader::set_region_partition( uint64_t partition_idx, uint64_t num_partitions) { check_partitioning(partition_idx, num_partitions); @@ -1954,31 +1935,48 @@ void Reader::prepare_regions_v4( // Add BED file regions, if specified. if (!params_.regions_file_uri.empty()) { auto start_bed_file_parse = std::chrono::steady_clock::now(); - Region::parse_bed_file_htslib( - params_.regions_file_uri, &pre_partition_regions_list); - LOG_INFO(fmt::format( - std::locale(""), - "Parsed bed file into {:L} regions in {:.3f} seconds.", - pre_partition_regions_list.size(), - utils::chrono_duration(start_bed_file_parse))); - } - - // Add BED array regions, if specified. - if (!params_.bed_array_uri.empty()) { - auto start_bed_array_parse = std::chrono::steady_clock::now(); - // Open the BED array - auto array = - std::make_shared(*ctx_, params_.bed_array_uri, TILEDB_READ); + // If the URI points to an array, it is either a BED file or a BED array + if (Object::object(*ctx_, params_.regions_file_uri).type() == + Object::Type::Array) { + auto array = + std::make_shared(*ctx_, params_.regions_file_uri, TILEDB_READ); + + // Export a temporary copy of the BED file + auto temp_bed_file = utils::temp_filename(".bed"); + auto status = tiledb_filestore_uri_export( + ctx_->ptr().get(), + temp_bed_file.c_str(), + params_.regions_file_uri.c_str()); + + if (status == TILEDB_OK) { + // Parse the BED file + LOG_DEBUG( + "[Reader] Parsing filestore BED file '{}'", + params_.regions_file_uri); + Region::parse_bed_file_htslib( + temp_bed_file, &pre_partition_regions_list); + + // Remove the temporary BED file + // std::remove(temp_bed_file.c_str()); + } else { + // Parse the BED array + LOG_DEBUG("[Reader] Parsing BED array '{}'", params_.regions_file_uri); + Region::read_bed_array(array, pre_partition_regions_list); + } - // Load the regions from the BED array - Region::read_bed_array(array, pre_partition_regions_list); + } else { + // The URI is not an array, treat it as a BED file + LOG_DEBUG("[Reader] Parsing BED file '{}'", params_.regions_file_uri); + Region::parse_bed_file_htslib( + params_.regions_file_uri, &pre_partition_regions_list); + } LOG_INFO(fmt::format( std::locale(""), - "Parsed bed array into {:L} regions in {:.3f} seconds.", + "Parsed bed file into {:L} regions in {:.3f} seconds.", pre_partition_regions_list.size(), - utils::chrono_duration(start_bed_array_parse))); + utils::chrono_duration(start_bed_file_parse))); } std::pair region_non_empty_domain = diff --git a/libtiledbvcf/src/read/reader.h b/libtiledbvcf/src/read/reader.h index 477a5c750..12d01b584 100644 --- a/libtiledbvcf/src/read/reader.h +++ b/libtiledbvcf/src/read/reader.h @@ -93,7 +93,6 @@ struct ExportParams { std::string log_file; std::string samples_file_uri; std::string regions_file_uri; - std::string bed_array_uri; std::vector sample_names; std::vector regions; std::string output_dir; @@ -215,9 +214,6 @@ class Reader { /** Sets the BED file URI parameter. */ void set_bed_file(const std::string& uri); - /** Sets the BED array URI parameter. */ - void set_bed_array(const std::string& uri); - /** Sets the region partitioning. */ void set_region_partition(uint64_t partition_idx, uint64_t num_partitions); diff --git a/libtiledbvcf/src/utils/utils.cc b/libtiledbvcf/src/utils/utils.cc index b3396e791..7b36f5879 100644 --- a/libtiledbvcf/src/utils/utils.cc +++ b/libtiledbvcf/src/utils/utils.cc @@ -30,8 +30,10 @@ #include #endif #include +#include #include #include +#include #include "htslib_plugin/hfile_tiledb_vfs.h" #include "utils/logger_public.h" @@ -630,6 +632,15 @@ bool query_buffers_set(tiledb::Query* query) { return false; } +std::string temp_filename(const std::string& extension) { + std::random_device rd; + std::stringstream filename; + filename << "vcf_tmp_" << rd() << extension; + std::filesystem::path path = + std::filesystem::temp_directory_path() / filename.str(); + return path.string(); +} + } // namespace utils } // namespace vcf } // namespace tiledb diff --git a/libtiledbvcf/src/utils/utils.h b/libtiledbvcf/src/utils/utils.h index 3c95a9145..52031404c 100644 --- a/libtiledbvcf/src/utils/utils.h +++ b/libtiledbvcf/src/utils/utils.h @@ -513,6 +513,13 @@ bool contains(std::vector& vec, T& item) { */ bool query_buffers_set(tiledb::Query* query); +/** + * @brief Return a random temporary file name + * + * @return std::string full path to temporary file + */ +std::string temp_filename(const std::string& extension = ""); + } // namespace utils } // namespace vcf } // namespace tiledb diff --git a/libtiledbvcf/src/vcf/region.cc b/libtiledbvcf/src/vcf/region.cc index b8812c032..6aecf7802 100644 --- a/libtiledbvcf/src/vcf/region.cc +++ b/libtiledbvcf/src/vcf/region.cc @@ -181,10 +181,9 @@ void Region::read_bed_array( auto chromStart = mq.data(column_alias[START])[i]; auto chromEnd = mq.data(column_alias[END])[i]; - // Each region in the BED array is 0-indexed and inclusive - // [0-start, end] which is the same format as the Region struct, so no - // modification is required. - result.emplace_back(chrom, chromStart, chromEnd, line++); + // BED files contain 0-indexed, half-open regions. Convert to a + // 0-based, inclusive Region object. + result.emplace_back(chrom, chromStart, chromEnd - 1, line++); } } } diff --git a/libtiledbvcf/test/src/unit-c-api-reader.cc b/libtiledbvcf/test/src/unit-c-api-reader.cc index 928e1466f..10c2846ae 100644 --- a/libtiledbvcf/test/src/unit-c-api-reader.cc +++ b/libtiledbvcf/test/src/unit-c-api-reader.cc @@ -614,10 +614,6 @@ TEST_CASE("C API: Reader set BED file", "[capi][reader]") { REQUIRE(tiledb_vcf_reader_init(reader, dataset_uri.c_str()) == TILEDB_VCF_OK); - REQUIRE( - tiledb_vcf_reader_set_bed_file(reader, "file:///does/not/exist.bed") == - TILEDB_VCF_ERR); - auto bed_uri = TILEDB_VCF_TEST_INPUT_DIR + std::string("/simple.bed"); REQUIRE( tiledb_vcf_reader_set_bed_file(reader, bed_uri.c_str()) == TILEDB_VCF_OK); @@ -625,30 +621,6 @@ TEST_CASE("C API: Reader set BED file", "[capi][reader]") { tiledb_vcf_reader_free(&reader); } -TEST_CASE("C API: Reader set BED array", "[capi][reader]") { - tiledb_vcf_reader_t* reader = nullptr; - REQUIRE(tiledb_vcf_reader_alloc(&reader) == TILEDB_VCF_OK); - - std::string dataset_uri; - dataset_uri = INPUT_ARRAYS_DIR_V4 + "/ingested_2samples"; - - REQUIRE(tiledb_vcf_reader_init(reader, dataset_uri.c_str()) == TILEDB_VCF_OK); - - REQUIRE( - tiledb_vcf_reader_set_bed_array(reader, "file:///does/not/exist.bed") == - TILEDB_VCF_ERR); - - // This check is making sure we can set the bed URI without an error, - // it doesn't matter if the URI is a BED array. Since we don't have a BED - // array in the test data, we use the data array. - auto bed_uri = dataset_uri + "/data"; - REQUIRE( - tiledb_vcf_reader_set_bed_array(reader, bed_uri.c_str()) == - TILEDB_VCF_OK); - - tiledb_vcf_reader_free(&reader); -} - TEST_CASE("C API: Reader set buffers", "[capi][reader]") { tiledb_vcf_reader_t* reader = nullptr; REQUIRE(tiledb_vcf_reader_alloc(&reader) == TILEDB_VCF_OK);