From 286b29b43ebb3d58a61a8d788cc655c73a65f25a Mon Sep 17 00:00:00 2001 From: hengfei yang Date: Wed, 14 Aug 2024 23:53:02 +0800 Subject: [PATCH 1/5] feat: add contains automaton --- Cargo.toml | 5 ++- examples/contains.rs | 16 ++++++++ src/automaton/contains.rs | 78 +++++++++++++++++++++++++++++++++++++++ src/automaton/mod.rs | 9 +++++ 4 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 examples/contains.rs create mode 100644 src/automaton/contains.rs diff --git a/Cargo.toml b/Cargo.toml index ca497485..fd155997 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fst" -version = "0.4.7" #:version +version = "0.4.8" #:version authors = ["Andrew Gallant "] description = """ Use finite state transducers to compactly represents sets or maps of many @@ -19,8 +19,9 @@ members = ["bench", "fst-bin"] exclude = ["fst-levenshtein", "fst-regex"] [features] -default = [] +default = ["levenshtein", "contains"] levenshtein = ["utf8-ranges"] +contains = ["utf8-ranges"] [patch.crates-io] fst = { path = "." } diff --git a/examples/contains.rs b/examples/contains.rs new file mode 100644 index 00000000..87e3c6e4 --- /dev/null +++ b/examples/contains.rs @@ -0,0 +1,16 @@ +use fst::{automaton::Contains, IntoStreamer, Set}; + +fn main() -> Result<(), Box> { + let paths = vec!["a foo bar", "foo", "foo1", "foo12", "foo3", "foobar"]; + let set = Set::from_iter(paths)?; + + // Build our prefix query. + let prefix = Contains::new("foob"); + + // Apply our query to the set we built. + let stream = set.search(&prefix).into_stream(); + + let matches = stream.into_strs()?; + println!("{:?}", matches); + Ok(()) +} diff --git a/src/automaton/contains.rs b/src/automaton/contains.rs new file mode 100644 index 00000000..724a7519 --- /dev/null +++ b/src/automaton/contains.rs @@ -0,0 +1,78 @@ +use crate::automaton::{Automaton, StartsWith}; + +/// An automaton that matches if the input contains a specific string.
+/// +/// ```rust +/// extern crate fst; +/// +/// use fst::{Automaton, IntoStreamer, Streamer, Set}; +/// use fst::automaton::Contains; +/// +/// # fn main() { example().unwrap(); } +/// fn example() -> Result<(), Box> { +/// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"]; +/// let set = Set::from_iter(paths)?; +/// +/// // Build our contains query. +/// let keyword = Contains::new("/projects"); +/// +/// // Apply our query to the set we built. +/// let mut stream = set.search(keyword).into_stream(); +/// +/// let matches = stream.into_strs()?; +/// assert_eq!(matches, vec!["/home/projects/bar", "/home/projects/foo"]); +/// Ok(()) +/// } +/// ``` +#[derive(Clone, Debug)] +pub struct Contains<'a> { + string: &'a [u8], +} + +impl<'a> Contains<'a> { + /// Constructs automaton that matches an exact string. + #[inline] + pub fn new(string: &'a str) -> StartsWith> { + Self { string: string.as_bytes() }.starts_with() + } +} + +impl<'a> Automaton for Contains<'a> { + type State = Option; + + #[inline] + fn start(&self) -> Option { + Some(0) + } + + #[inline] + fn is_match(&self, pos: &Option) -> bool { + pos.is_some() && pos.unwrap() >= self.string.len() + } + + #[inline] + fn can_match(&self, pos: &Option) -> bool { + pos.is_some() + } + + #[inline] + fn accept(&self, pos: &Option, byte: u8) -> Option { + // if we aren't already past the end... + if let Some(pos) = *pos { + // and there is still a matching byte at the current position... 
+ if self.string.get(pos).cloned() == Some(byte) { + // then move forward + return Some(pos + 1); + } else { + if pos >= self.string.len() { + // if we're past the end, then we're done + return Some(i32::MAX as usize); + } else { + return Some(0); + } + } + } + // otherwise we're either past the end or didn't match the byte + None + } +} diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs index fe503ed6..856a1262 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -1,9 +1,18 @@ + +#[cfg(feature = "contains")] +pub use self::contains::Contains; + +#[cfg(feature = "contains")] +mod contains; + #[cfg(feature = "levenshtein")] pub use self::levenshtein::{Levenshtein, LevenshteinError}; #[cfg(feature = "levenshtein")] mod levenshtein; + + /// Automaton describes types that behave as a finite automaton. /// /// All implementors of this trait are represented by *byte based* automata. From c3432d1936280b498e8b1ac9e35325f5feea2ca1 Mon Sep 17 00:00:00 2001 From: hengfei yang Date: Wed, 14 Aug 2024 23:56:06 +0800 Subject: [PATCH 2/5] chore: update comment --- examples/contains.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/contains.rs b/examples/contains.rs index 87e3c6e4..a6ff5030 100644 --- a/examples/contains.rs +++ b/examples/contains.rs @@ -4,7 +4,7 @@ fn main() -> Result<(), Box> { let paths = vec!["a foo bar", "foo", "foo1", "foo12", "foo3", "foobar"]; let set = Set::from_iter(paths)?; - // Build our prefix query. + // Build our contains query. let prefix = Contains::new("foob"); // Apply our query to the set we built. 
From 9eca66caa5d7ecad3f7f80a2cc45162efdf15cf3 Mon Sep 17 00:00:00 2001 From: hengfei yang Date: Thu, 15 Aug 2024 00:09:58 +0800 Subject: [PATCH 3/5] fix: default features --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index fd155997..61f5aa5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ members = ["bench", "fst-bin"] exclude = ["fst-levenshtein", "fst-regex"] [features] -default = ["levenshtein", "contains"] +default = [] levenshtein = ["utf8-ranges"] contains = ["utf8-ranges"] From 68c09f214d37dac1cd4d24f4ffa06b6bb5adbfbf Mon Sep 17 00:00:00 2001 From: hengfei yang Date: Thu, 15 Aug 2024 00:20:49 +0800 Subject: [PATCH 4/5] fix: improve code --- examples/contains.rs | 16 ---------------- src/automaton/contains.rs | 4 ++-- src/automaton/mod.rs | 11 ++++------- 3 files changed, 6 insertions(+), 25 deletions(-) delete mode 100644 examples/contains.rs diff --git a/examples/contains.rs b/examples/contains.rs deleted file mode 100644 index a6ff5030..00000000 --- a/examples/contains.rs +++ /dev/null @@ -1,16 +0,0 @@ -use fst::{automaton::Contains, IntoStreamer, Set}; - -fn main() -> Result<(), Box> { - let paths = vec!["a foo bar", "foo", "foo1", "foo12", "foo3", "foobar"]; - let set = Set::from_iter(paths)?; - - // Build our contains query. - let prefix = Contains::new("foob"); - - // Apply our query to the set we built. - let stream = set.search(&prefix).into_stream(); - - let matches = stream.into_strs()?; - println!("{:?}", matches); - Ok(()) -} diff --git a/src/automaton/contains.rs b/src/automaton/contains.rs index 724a7519..389df4b6 100644 --- a/src/automaton/contains.rs +++ b/src/automaton/contains.rs @@ -32,8 +32,8 @@ pub struct Contains<'a> { impl<'a> Contains<'a> { /// Constructs automaton that matches an exact string. 
#[inline] - pub fn new(string: &'a str) -> StartsWith> { - Self { string: string.as_bytes() }.starts_with() + pub fn new(string: &'a str) -> Contains<'a> { + Self { string: string.as_bytes() } } } diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs index 856a1262..c09d5ec1 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -1,17 +1,14 @@ - -#[cfg(feature = "contains")] -pub use self::contains::Contains; - -#[cfg(feature = "contains")] -mod contains; - #[cfg(feature = "levenshtein")] pub use self::levenshtein::{Levenshtein, LevenshteinError}; #[cfg(feature = "levenshtein")] mod levenshtein; +#[cfg(feature = "contains")] +pub use self::contains::Contains; +#[cfg(feature = "contains")] +mod contains; /// Automaton describes types that behave as a finite automaton. /// From 17b15aa6d331e71692163647ac95431b6a425141 Mon Sep 17 00:00:00 2001 From: hengfei yang Date: Thu, 15 Aug 2024 00:22:06 +0800 Subject: [PATCH 5/5] chore: update comment --- src/automaton/contains.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/automaton/contains.rs b/src/automaton/contains.rs index 389df4b6..98d4ccb4 100644 --- a/src/automaton/contains.rs +++ b/src/automaton/contains.rs @@ -30,7 +30,7 @@ pub struct Contains<'a> { } impl<'a> Contains<'a> { - /// Constructs automaton that matches an exact string. + /// Constructs an automaton that matches inputs containing the given string. #[inline] pub fn new(string: &'a str) -> Contains<'a> { Self { string: string.as_bytes() }