Skip to content

Commit

Permalink
config: Separate normalizer and tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
tontinton committed Jun 1, 2024
1 parent 81cdd36 commit 27efbbb
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 53 deletions.
49 changes: 37 additions & 12 deletions src/config/dynamic_object.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,49 @@
use serde::{Deserialize, Serialize};
use tantivy::schema::{JsonObjectOptions, TextFieldIndexing};
use tantivy::schema::{IndexRecordOption, JsonObjectOptions, TextFieldIndexing};

use super::{
default_true,
text::{FastTextFieldType, IndexedTextFieldType},
};
use super::{default_true, FastFieldNormalizerType, FieldTokenizerType};

/// Indexing options for a dynamic object field.
///
/// Every field is optional when deserializing; a missing field takes the
/// default noted on it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexedDynamicObjectFieldConfig {
/// Record option handed to `TextFieldIndexing::set_index_option`.
/// Falls back to `IndexRecordOption::default()` when omitted.
#[serde(default)]
pub record: IndexRecordOption,

/// Tokenizer handed to `TextFieldIndexing::set_tokenizer`.
/// Falls back to `default_tokenizer()` (`FieldTokenizerType::Raw`) when omitted.
#[serde(default = "default_tokenizer")]
pub tokenizer: FieldTokenizerType,
}

/// Serde default for `IndexedDynamicObjectFieldConfig::tokenizer`: the raw tokenizer.
fn default_tokenizer() -> FieldTokenizerType {
FieldTokenizerType::Raw
}

/// Whether, and how, a dynamic object field is indexed.
///
/// Variant names are (de)serialized in snake_case.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum IndexedDynamicObjectFieldType {
/// No indexing options are set on the field.
False,
/// Index with `TextFieldIndexing::default()`. This is the default variant.
#[default]
True,
/// Index with the record option and tokenizer from the given config.
Indexed(IndexedDynamicObjectFieldConfig),
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DynamicObjectFieldConfig {
#[serde(default = "default_true")]
pub stored: bool,

#[serde(default)]
pub fast: FastTextFieldType,
#[serde(default = "default_fast_normalizer")]
pub fast: FastFieldNormalizerType,

#[serde(default)]
pub indexed: IndexedTextFieldType,
pub indexed: IndexedDynamicObjectFieldType,

#[serde(default = "default_true")]
pub expand_dots: bool,
}

/// Serde default for `DynamicObjectFieldConfig::fast`: `FastFieldNormalizerType::True`.
fn default_fast_normalizer() -> FastFieldNormalizerType {
FastFieldNormalizerType::True
}

impl From<DynamicObjectFieldConfig> for JsonObjectOptions {
fn from(config: DynamicObjectFieldConfig) -> Self {
let mut options = JsonObjectOptions::default();
Expand All @@ -29,13 +52,15 @@ impl From<DynamicObjectFieldConfig> for JsonObjectOptions {
}
options = options.set_fast(config.fast.into());
match config.indexed {
IndexedTextFieldType::False => {}
IndexedTextFieldType::True => {
IndexedDynamicObjectFieldType::False => {}
IndexedDynamicObjectFieldType::True => {
options = options.set_indexing_options(TextFieldIndexing::default());
}
IndexedTextFieldType::Indexed(config) => {
IndexedDynamicObjectFieldType::Indexed(config) => {
options = options.set_indexing_options(
TextFieldIndexing::default().set_index_option(config.record),
TextFieldIndexing::default()
.set_index_option(config.record)
.set_tokenizer(config.tokenizer.into()),
);
}
}
Expand Down
61 changes: 60 additions & 1 deletion src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ use color_eyre::eyre::Result;
use serde::{Deserialize, Serialize};
use tokio::fs::read_to_string;

use crate::config::dynamic_object::IndexedDynamicObjectFieldType;

use self::{
boolean::BooleanFieldConfig,
datetime::DateTimeFieldConfig,
Expand All @@ -30,6 +32,61 @@ fn default_true() -> bool {
true
}

/// Normalizer selection for a field's `fast` option.
///
/// Converts into the `Option<&str>` normalizer name passed to `set_fast`:
/// `None`, `Some("default")` or `Some("raw")`.
/// Variant names are (de)serialized in snake_case.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FastFieldNormalizerType {
/// Converts to `None`, i.e. no normalizer name is passed to `set_fast`.
False,

/// Splits the text on whitespace and punctuation, removes tokens
/// that are too long, and lowercases tokens.
/// Converts to the `"default"` normalizer name.
True,

/// Does not process nor tokenize the text.
/// Converts to the `"raw"` normalizer name.
Raw,
}

/// Maps a fast-field normalizer choice to the optional normalizer name
/// expected by `set_fast`.
impl From<FastFieldNormalizerType> for Option<&str> {
    fn from(normalizer: FastFieldNormalizerType) -> Self {
        // `False` is the only variant without a name, so bail out early
        // and wrap the remaining names in `Some` once.
        let name = match normalizer {
            FastFieldNormalizerType::False => return None,
            FastFieldNormalizerType::True => "default",
            FastFieldNormalizerType::Raw => "raw",
        };
        Some(name)
    }
}

/// Tokenizer applied to a field when it is indexed.
///
/// Converts into the tokenizer name passed to `set_tokenizer`:
/// `"default"`, `"raw"`, `"en_stem"` or `"whitespace"`.
/// Variant names are (de)serialized in snake_case.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FieldTokenizerType {
/// Splits the text on whitespace and punctuation, removes tokens
/// that are too long, and lowercases tokens.
Default,

/// Does not process nor tokenize the text.
Raw,

/// Like `default`, but also applies stemming on the
/// resulting tokens. Stemming can improve the recall of your
/// search engine.
EnStem,

/// Splits the text on whitespace only.
Whitespace,
}

/// Maps a tokenizer choice to the tokenizer name expected by `set_tokenizer`.
impl From<FieldTokenizerType> for &str {
    fn from(tokenizer: FieldTokenizerType) -> Self {
        // Exhaustive on purpose: adding a variant must force a name to be chosen here.
        match tokenizer {
            FieldTokenizerType::Raw => "raw",
            FieldTokenizerType::Default => "default",
            FieldTokenizerType::Whitespace => "whitespace",
            FieldTokenizerType::EnStem => "en_stem",
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FieldType {
Expand All @@ -56,7 +113,9 @@ impl FieldsConfig {
Boolean(config) => config.indexed,
Datetime(config) => config.indexed,
Ip(config) => config.indexed,
DynamicObject(config) => !matches!(config.indexed, IndexedTextFieldType::False),
DynamicObject(config) => {
!matches!(config.indexed, IndexedDynamicObjectFieldType::False)
}
}
}
}
Expand Down
67 changes: 27 additions & 40 deletions src/config/text.rs
Original file line number Diff line number Diff line change
@@ -1,52 +1,34 @@
use serde::{Deserialize, Serialize};
use tantivy::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};

use super::default_true;
use super::{FastFieldNormalizerType, FieldTokenizerType, default_true};

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum FastTextFieldType {
#[default]
False,

/// Chops the text on according to whitespace and
/// punctuation, removes tokens that are too long, and lowercases
/// tokens.
True,
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexedTextFieldConfig {
#[serde(default)]
pub record: IndexRecordOption,

/// Does not process nor tokenize the text.
Raw,
#[serde(default = "default_true")]
pub fieldnorms: bool,

/// Like `true`, but also applies stemming on the
/// resulting tokens. Stemming can improve the recall of your
/// search engine.
EnStem,
#[serde(default = "default_tokenizer")]
pub tokenizer: FieldTokenizerType,
}

/// Splits the text on whitespaces.
Whitespace,
/// Serde default for `IndexedTextFieldConfig::tokenizer`: `FieldTokenizerType::Default`.
fn default_tokenizer() -> FieldTokenizerType {
FieldTokenizerType::Default
}

impl From<FastTextFieldType> for Option<&str> {
fn from(value: FastTextFieldType) -> Self {
match value {
FastTextFieldType::False => None,
FastTextFieldType::True => Some("default"),
FastTextFieldType::Raw => Some("raw"),
FastTextFieldType::EnStem => Some("en_stem"),
FastTextFieldType::Whitespace => Some("whitespace"),
/// Default text indexing config: default record option, fieldnorms enabled,
/// and the `Default` tokenizer. These are the same values the serde field
/// defaults on `IndexedTextFieldConfig` produce.
impl Default for IndexedTextFieldConfig {
    fn default() -> Self {
        let record = IndexRecordOption::default();
        let tokenizer = FieldTokenizerType::Default;
        Self {
            record,
            fieldnorms: true,
            tokenizer,
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct IndexedTextFieldConfig {
#[serde(default)]
pub record: IndexRecordOption,

#[serde(default = "default_true")]
pub fieldnorms: bool,
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum IndexedTextFieldType {
Expand All @@ -56,18 +38,22 @@ pub enum IndexedTextFieldType {
Indexed(IndexedTextFieldConfig),
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextFieldConfig {
#[serde(default = "default_true")]
pub stored: bool,

#[serde(default)]
pub fast: FastTextFieldType,
#[serde(default = "default_fast_normalizer")]
pub fast: FastFieldNormalizerType,

#[serde(default)]
pub indexed: IndexedTextFieldType,
}

/// Serde default for `TextFieldConfig::fast`: `FastFieldNormalizerType::False`.
fn default_fast_normalizer() -> FastFieldNormalizerType {
FastFieldNormalizerType::False
}

impl From<TextFieldConfig> for TextOptions {
fn from(config: TextFieldConfig) -> Self {
let mut options = TextOptions::default();
Expand All @@ -84,7 +70,8 @@ impl From<TextFieldConfig> for TextOptions {
options = options.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(config.record)
.set_fieldnorms(config.fieldnorms),
.set_fieldnorms(config.fieldnorms)
.set_tokenizer(config.tokenizer.into()),
);
}
}
Expand Down

0 comments on commit 27efbbb

Please sign in to comment.