From b87422dbc3a5656f49c887c1eb606a6fb5f9fe1c Mon Sep 17 00:00:00 2001 From: Jakob Ledermann Date: Tue, 13 Sep 2022 22:57:04 +0200 Subject: [PATCH] Add a command-line option '--delimiter' to `map`. This command-line option allows specifying the delimiter between keys and values in the input file of the `map` subcommand. The provided delimiter should be a single-byte UTF-8 character. If the value consists of multiple bytes, only the first one is used. This parameter is passed to `csv::ReaderBuilder::delimiter`, which only accepts a single byte as the delimiter. --- fst-bin/src/app.rs | 5 ++++- fst-bin/src/cmd/map.rs | 24 +++++++++++++++++++++++- fst-bin/src/util.rs | 6 ++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/fst-bin/src/app.rs b/fst-bin/src/app.rs index f8ecb4c8..c18dc790 100644 --- a/fst-bin/src/app.rs +++ b/fst-bin/src/app.rs @@ -288,7 +288,10 @@ pub fn app() -> clap::App<'static, 'static> { )) .arg(flag("keep-tmp-dir").help( "Does not delete the temporary directory. Useful for debugging.", - )); + )) + .arg(flag("delimiter").help( + "The delimiter used in the CSV file to separate key and value in each line. 
\ + This defaults to ','.",)); let node = cmd("node", ABOUT_NODE) .arg(pos("input").required(true).help("The FST to inspect.")) diff --git a/fst-bin/src/cmd/map.rs b/fst-bin/src/cmd/map.rs index d6e356e2..4a49d422 100644 --- a/fst-bin/src/cmd/map.rs +++ b/fst-bin/src/cmd/map.rs @@ -26,6 +26,17 @@ struct Args { keep_tmp_dir: bool, max: bool, min: bool, + delimiter: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct DelimiterInvalidError; + +impl std::error::Error for DelimiterInvalidError {} +impl std::fmt::Display for DelimiterInvalidError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "The provided value is no valid delimiter") + } } impl Args { @@ -47,6 +58,15 @@ impl Args { keep_tmp_dir: m.is_present("keep-tmp-dir"), max: m.is_present("max"), min: m.is_present("min"), + delimiter: m + .value_of_lossy("delimiter") + .map(|x| { + x.as_bytes() + .get(0) + .map(|y| *y) + .ok_or(DelimiterInvalidError) + }) + .transpose()?, }) } @@ -66,6 +86,7 @@ impl Args { let mut map = MapBuilder::new(wtr)?; for input in &self.input { let mut rdr = csv::ReaderBuilder::new() + .delimiter(self.delimiter.unwrap_or(b',')) .has_headers(false) .from_reader(util::get_reader(Some(input))?); for row in rdr.deserialize() { @@ -82,7 +103,8 @@ impl Args { .iter() .map(|inp| Path::new(inp).to_path_buf()) .collect(); - let keys = util::ConcatCsv::new(inputs); + let keys = + util::ConcatCsv::new(inputs, self.delimiter.unwrap_or(b',')); let mut merger = Merger::new(keys, &self.output); merger = merger.fd_limit(self.fd_limit); diff --git a/fst-bin/src/util.rs b/fst-bin/src/util.rs index dfe4970c..17028f03 100644 --- a/fst-bin/src/util.rs +++ b/fst-bin/src/util.rs @@ -135,15 +135,16 @@ impl Iterator for ConcatLines { pub struct ConcatCsv { inputs: Vec, cur: Option, + delimiter: u8, } type Reader = Box; type Rows = csv::DeserializeRecordsIntoIter; impl ConcatCsv { - pub fn new(mut inputs: Vec) -> ConcatCsv { + pub fn new(mut inputs: Vec, 
delimiter: u8) -> ConcatCsv { inputs.reverse(); // treat it as a stack - ConcatCsv { inputs, cur: None } + ConcatCsv { inputs, cur: None, delimiter } } fn read_row(&mut self) -> Option> { @@ -173,6 +174,7 @@ impl Iterator for ConcatCsv { Ok(rdr) => rdr, }; let csvrdr = csv::ReaderBuilder::new() + .delimiter(self.delimiter) .has_headers(false) .from_reader(rdr); self.cur = Some(csvrdr.into_deserialize());