Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index catalog #2

Merged
merged 4 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ opendal = { version = "0.46.0", features = ["services-fs"] }
pretty_env_logger = "0.5.0"
serde = { version = "1.0.201", features = ["derive", "rc"] }
serde_json = "1.0.117"
sqlx = { version = "0.7.4", features = ["postgres", "macros", "runtime-tokio"] }
serde_yaml = "0.9.34"
sqlx = { version = "0.7.4", features = ["postgres", "macros", "runtime-tokio", "json"] }
tantivy = "0.22.0"
tokio = { version = "1.37.0", features = ["full"] }
tokio-util = { version = "0.7.11", features = ["compat"] }
Expand Down
13 changes: 13 additions & 0 deletions example_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
version: 1
name: test
path: /tmp/toshokan
schema:
time_field: timestamp
mappings:
timestamp:
type: !datetime
formats:
- timestamp
indexed: true
stored: true
fast: true
5 changes: 0 additions & 5 deletions migrations/0001_index_files.up.sql

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
-- index_files is dropped first: it holds a foreign key referencing indexes(name).
DROP TABLE index_files;
DROP TABLE indexes;
11 changes: 11 additions & 0 deletions migrations/0001_indexes.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- One row per index, keyed by the index name.
CREATE TABLE IF NOT EXISTS indexes(
name TEXT PRIMARY KEY,
-- Index configuration stored as a JSON document.
config jsonb NOT NULL
);

-- Files that belong to an index. Deleting a row from indexes also deletes
-- its rows here (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS index_files(
-- NOTE(review): VARCHAR(36) presumably holds a textual UUID; confirm against the writer.
id VARCHAR(36) PRIMARY KEY,
index_name TEXT NOT NULL REFERENCES indexes(name) ON DELETE CASCADE,
file_name TEXT NOT NULL,
-- NOTE(review): presumably the byte length of the file's footer; confirm against the reader.
footer_len BIGINT NOT NULL
);
35 changes: 27 additions & 8 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,6 @@ use clap::Parser;
#[derive(Parser, Debug, Clone)]
#[command(author, version, about, long_about = None)]
pub struct Args {
#[clap(
short,
long,
help = "Path to the index dir.",
default_value = "/tmp/toshokan"
)]
pub index_dir: String,

#[clap(
long,
help = "Postgres DB connection url.
Expand All @@ -24,6 +16,12 @@ Can also be provided by a DATABASE_URL env var, but only if this arg is not prov

#[derive(Parser, Debug, Clone)]
pub enum SubCommand {
#[clap(name = "create")]
Create(CreateArgs),

#[clap(name = "drop")]
Drop(DropArgs),

#[clap(name = "index")]
Index(IndexArgs),

Expand All @@ -34,8 +32,23 @@ pub enum SubCommand {
Search(SearchArgs),
}

// Arguments of the `create` subcommand.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help/about text, which would alter the CLI output.
#[derive(Parser, Debug, Clone)]
pub struct CreateArgs {
// Positional argument (no `short`/`long` attribute).
#[clap(help = "Path to the input config file.")]
pub config_path: String,
}

// Arguments of the `drop` subcommand.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help/about text, which would alter the CLI output.
#[derive(Parser, Debug, Clone)]
pub struct DropArgs {
// Positional argument (no `short`/`long` attribute).
#[clap(help = "The index name.")]
pub name: String,
}

#[derive(Parser, Debug, Clone)]
pub struct IndexArgs {
#[clap(help = "The index name.")]
pub name: String,

#[clap(help = "Path to the input jsonl file you want to index.")]
pub input_path: String,

Expand All @@ -58,6 +71,9 @@ The memory is split evenly between all indexing threads, once a thread reaches i

#[derive(Parser, Debug, Clone)]
pub struct MergeArgs {
#[clap(help = "The index name.")]
pub name: String,

#[clap(
short,
long,
Expand All @@ -69,6 +85,9 @@ pub struct MergeArgs {

#[derive(Parser, Debug, Clone)]
pub struct SearchArgs {
#[clap(help = "The index name.")]
pub name: String,

#[clap(help = "Query in tantivy syntax.")]
pub query: String,

Expand Down
196 changes: 196 additions & 0 deletions src/index_config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
use std::{collections::HashMap, ops::Deref, path::Path};

use color_eyre::eyre::Result;
use serde::{Deserialize, Serialize};
use tantivy::{
schema::{IndexRecordOption, TextFieldIndexing, TextOptions},
DateOptions, DateTimePrecision,
};
use tokio::fs::read_to_string;

/// Config version applied when a config does not specify `version`.
const VERSION: u32 = 1;

/// Serde default provider for the `version` field: returns [`VERSION`].
fn default_version() -> u32 {
    VERSION
}

/// Serde default provider for boolean options that are on unless disabled.
fn default_true() -> bool {
    true
}

/// Fast-field setting of a text mapping.
///
/// Variants are (de)serialized in snake_case: `disabled`, `raw`, `lowercase`.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum FastTextFieldType {
    /// The default; converts to `None`.
    #[default]
    Disabled,
    /// Converts to `Some("raw")`.
    Raw,
    /// Converts to `Some("lowercase")`.
    Lowercase,
}

impl From<FastTextFieldType> for Option<&str> {
    /// Maps the setting to an optional name: `None` for `Disabled`,
    /// otherwise `"raw"` or `"lowercase"`.
    fn from(value: FastTextFieldType) -> Self {
        let name = match value {
            FastTextFieldType::Disabled => return None,
            FastTextFieldType::Raw => "raw",
            FastTextFieldType::Lowercase => "lowercase",
        };
        Some(name)
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct TextMappingConfig {
#[serde(default = "default_true")]
pub stored: bool,

#[serde(default)]
pub fast: FastTextFieldType,

#[serde(default)]
pub record: IndexRecordOption,

#[serde(default = "default_true")]
pub fieldnorms: bool,
}

/// Fast-field setting of a datetime mapping.
///
/// Variants are (de)serialized in snake_case, so `false` and `true` are
/// accepted alongside the explicit precision names.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum DateTimeFastPrecisionType {
    /// The default; converts to `None`.
    #[default]
    False,
    /// Converts to the same precision as `Seconds`.
    True,
    Seconds,
    Milliseconds,
    Microseconds,
    Nanoseconds,
}

impl From<DateTimeFastPrecisionType> for Option<DateTimePrecision> {
fn from(value: DateTimeFastPrecisionType) -> Self {
use DateTimeFastPrecisionType::*;
match value {
False => None,
True | Seconds => Some(DateTimePrecision::Seconds),
Milliseconds => Some(DateTimePrecision::Milliseconds),
Microseconds => Some(DateTimePrecision::Microseconds),
Nanoseconds => Some(DateTimePrecision::Nanoseconds),
}
}
}

/// A datetime input format name.
///
/// Variants are (de)serialized in snake_case: `iso8601`, `rfc2822`,
/// `rfc3339`, `timestamp`.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum DateTimeFormatType {
    Iso8601,
    Rfc2822,
    Rfc3339,
    Timestamp,
}

/// Newtype around a list of [`DateTimeFormatType`] values.
///
/// As a single-field tuple struct it (de)serializes as the inner list.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct DateTimeFormats(Vec<DateTimeFormatType>);

impl Default for DateTimeFormats {
fn default() -> Self {
Self(vec![
DateTimeFormatType::Rfc3339,
DateTimeFormatType::Timestamp,
])
}
}

impl Deref for DateTimeFormats {
type Target = Vec<DateTimeFormatType>;

fn deref(&self) -> &Self::Target {
&self.0
}
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DateTimeMappingConfig {
#[serde(default = "default_true")]
pub stored: bool,

#[serde(default = "default_true")]
pub indexed: bool,

#[serde(default)]
pub fast: DateTimeFastPrecisionType,

#[serde(default)]
pub formats: DateTimeFormats,
}

/// The type of a mapped field together with its type-specific options.
///
/// Variant names are (de)serialized in snake_case: `text` and `datetime`.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum MappingFieldType {
    /// A text field with its [`TextMappingConfig`].
    Text(TextMappingConfig),
    /// A datetime field with its [`DateTimeMappingConfig`].
    Datetime(DateTimeMappingConfig),
}

/// Configuration of a single field mapping.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct MappingConfig {
    /// Field type and options; appears under the key `type` when (de)serialized.
    #[serde(rename = "type")]
    pub type_: MappingFieldType,
}

/// Schema section of an index config.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct IndexSchema {
    /// Field mappings keyed by name; empty when omitted.
    #[serde(default)]
    pub mappings: HashMap<String, MappingConfig>,

    /// Optional `time_field` setting; left out of serialized output when `None`.
    // NOTE(review): presumably names one of the keys in `mappings`; nothing here validates that.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    time_field: Option<String>,
}

/// Top-level index configuration, (de)serializable with serde.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub(crate) struct IndexConfig {
    /// Index name (required).
    pub name: String,

    /// Index path (required).
    pub path: String,

    /// Config version; falls back to `default_version()` when omitted.
    #[serde(default = "default_version")]
    version: u32,

    /// Schema section; falls back to `IndexSchema::default()` when omitted.
    #[serde(default)]
    pub schema: IndexSchema,
}

impl IndexConfig {
    /// Reads the file at `path` and parses its contents as a YAML config.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read or if its contents do
    /// not deserialize into an [`IndexConfig`].
    pub async fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
        let raw_yaml = read_to_string(path).await?;
        let config = serde_yaml::from_str(&raw_yaml)?;
        Ok(config)
    }
}

impl From<TextMappingConfig> for TextOptions {
    /// Builds tantivy text options from the mapping config.
    ///
    /// The fast setting and the indexing options (record option plus
    /// fieldnorms) are always applied; the stored flag is set only when
    /// `config.stored` is true.
    fn from(config: TextMappingConfig) -> Self {
        let indexing = TextFieldIndexing::default()
            .set_index_option(config.record)
            .set_fieldnorms(config.fieldnorms);

        let options = TextOptions::default()
            .set_fast(config.fast.into())
            .set_indexing_options(indexing);

        if config.stored {
            options.set_stored()
        } else {
            options
        }
    }
}

impl From<DateTimeMappingConfig> for DateOptions {
fn from(config: DateTimeMappingConfig) -> Self {
let mut date_options = DateOptions::default();
if config.stored {
date_options = date_options.set_stored();
}
if config.indexed {
date_options = date_options.set_indexed();
}
if let Some(precision) = config.fast.into() {
date_options = date_options.set_fast().set_precision(precision);
}
date_options
}
}
Loading
Loading