19 changes: 13 additions & 6 deletions python/src/build.rs
@@ -14,18 +14,21 @@
* limitations under the License.
*/

use crate::dictionary::get_default_resource_dir;
use crate::errors;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyList, PyString, PyTuple, PyType};
use std::fs::{File, OpenOptions};
use std::io::BufWriter;
use std::path::Path;

use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyList, PyString, PyTuple, PyType};

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::Config;
use sudachi::dic::build::{DataSource, DictBuilder};
use sudachi::dic::dictionary::JapaneseDictionary;

use crate::dictionary::get_default_resource_dir;
use crate::errors;

pub fn register_functions(m: &Bound<PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(build_system_dic, m)?)?;
m.add_function(wrap_pyfunction!(build_user_dic, m)?)?;
@@ -80,7 +83,9 @@ fn build_system_dic<'py>(
description: Option<&str>,
) -> PyResult<Bound<'py, PyList>> {
let mut builder = DictBuilder::new_system();
description.map(|d| builder.set_description(d));
if let Some(d) = description {
builder.set_description(d)
}

let matrix_path = resolve_as_pypathstr(py, matrix)?;
let matrix_src = as_data_source(matrix_path.as_ref(), matrix)?;
@@ -138,7 +143,9 @@ fn build_user_dic<'py>(
};

let mut builder = DictBuilder::new_user(&system_dic);
description.map(|d| builder.set_description(d));
if let Some(d) = description {
builder.set_description(d)
}

for f in lex.iter() {
let lex_path = resolve_as_pypathstr(py, &f)?;
11 changes: 6 additions & 5 deletions python/src/dictionary.rs
@@ -14,17 +14,18 @@
* limitations under the License.
*/

use pyo3::prelude::*;
use pyo3::types::{PySet, PyString, PyTuple};
use std::convert::TryFrom;
use std::fmt::Write;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
use sudachi::analysis::Mode;

use pyo3::prelude::*;
use pyo3::types::{PySet, PyString, PyTuple};

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::analysis::Mode;
use sudachi::config::{Config, ConfigBuilder, SurfaceProjection};
use sudachi::dic::dictionary::JapaneseDictionary;
use sudachi::dic::grammar::Grammar;
@@ -447,7 +448,7 @@ fn config_repr(cfg: &Config) -> Result<String, std::fmt::Error> {
Ok(result)
}

pub(crate) fn extract_mode<'py>(mode: &Bound<'py, PyAny>) -> PyResult<Mode> {
pub(crate) fn extract_mode(mode: &Bound<'_, PyAny>) -> PyResult<Mode> {
if mode.is_instance_of::<PyString>() {
errors::wrap(Mode::from_str(mode.str()?.to_str()?))
} else if mode.is_instance_of::<PySplitMode>() {
@@ -471,7 +472,7 @@ fn read_config(config_opt: &Bound<PyAny>) -> PyResult<ConfigBuilder> {
let config_pystr = config_opt.str()?;
let config_str = config_pystr.to_str()?.trim();
// looks like json
if config_str.starts_with("{") && config_str.ends_with("}") {
if config_str.starts_with('{') && config_str.ends_with('}') {
let result = ConfigBuilder::from_bytes(config_str.as_bytes());
return errors::wrap(result);
}
3 changes: 2 additions & 1 deletion python/src/errors.rs
@@ -14,10 +14,11 @@
* limitations under the License.
*/

use std::fmt::{Debug, Display};

use pyo3::exceptions::PyDeprecationWarning;
use pyo3::prelude::*;
use pyo3::{import_exception, PyResult};
use std::fmt::{Debug, Display};

// Sudachi exception class is defined in Python
import_exception!(sudachipy.errors, SudachiError);
4 changes: 2 additions & 2 deletions python/src/morpheme.rs
@@ -163,7 +163,7 @@ impl PyMorphemeListWrapper {
for (i, m) in list.iter().enumerate() {
result.push_str(m.surface().deref());
if i + 1 != nmorphs {
result.push_str(" ");
result.push(' ');
}
}
PyString::new_bound(py, result.as_str())
@@ -196,7 +196,7 @@ impl PyMorphemeListWrapper {
}

fn __bool__(&self, py: Python) -> bool {
self.internal(py).len() != 0
!self.internal(py).is_empty()
}
}

3 changes: 1 addition & 2 deletions python/src/pos_matcher.rs
@@ -54,7 +54,7 @@ impl PyPosMatcher {
fn create_from_fn(dic: &Arc<PyDicData>, func: &Bound<PyAny>, py: Python) -> PyResult<Self> {
let mut data = Vec::new();
for (pos_id, pos) in dic.pos.iter().enumerate() {
let args = PyTuple::new_bound(py, &[pos]);
let args = PyTuple::new_bound(py, [pos]);
if func.call1(args)?.downcast::<PyBool>()?.is_true() {
data.push(pos_id as u16);
}
@@ -198,7 +198,6 @@ impl PyPosMatcher {
let max_id = self.dic.pos.len();
// map -> filter chain is needed to handle exactly u16::MAX POS entries
let values = (0..max_id)
.into_iter()
.map(|x| x as u16)
.filter(|id| !self.matcher.matches_id(*id));
let matcher = PosMatcher::new(values);
18 changes: 10 additions & 8 deletions python/src/pretokenizer.rs
@@ -14,21 +14,23 @@
* limitations under the License.
*/

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector};
use std::cell::RefCell;
use std::sync::Arc;

use pyo3::intern;
use pyo3::prelude::*;
use pyo3::sync::GILOnceCell;
use pyo3::types::{PyList, PySlice, PyTuple, PyType};
use std::cell::RefCell;
use std::sync::Arc;
use thread_local::ThreadLocal;

use crate::projection::MorphemeProjection;
use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
use sudachi::dic::subset::InfoSubset;
use sudachi::prelude::Mode;
use thread_local::ThreadLocal;

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector};
use crate::projection::MorphemeProjection;

/// This struct performs the actual tokenization
/// There should be at most one instance per thread of execution
@@ -152,7 +154,7 @@ impl PyPretokenizer {
}
Some(h) => {
let mrp: &Bound<PyAny> = morphs.bind(py);
let args = PyTuple::new_bound(py, &[index, string, mrp]);
let args = PyTuple::new_bound(py, [index, string, mrp]);
h.bind(py).call1(args)
}
}
20 changes: 11 additions & 9 deletions python/src/projection.rs
@@ -14,20 +14,23 @@
* limitations under the License.
*/

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::PyProjector;
use pyo3::prelude::*;
use pyo3::types::PyString;
use pyo3::{PyResult, Python};
use std::convert::TryFrom;
use std::ops::Deref;
use std::sync::Arc;

use pyo3::prelude::*;
use pyo3::types::PyString;
use pyo3::{PyResult, Python};

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::SurfaceProjection;
use sudachi::pos::PosMatcher;
use sudachi::prelude::Morpheme;

use crate::dictionary::PyDicData;
use crate::errors;
use crate::morpheme::PyProjector;

pub(crate) trait MorphemeProjection {
fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> Bound<'py, PyString>;
}
@@ -114,9 +117,8 @@ impl MorphemeProjection for NormalizedNouns {
}

fn conjugating_matcher<D: DictionaryAccess>(dic: &D) -> PosMatcher {
make_matcher(dic, |pos| match pos[0].deref() {
"動詞" | "形容詞" | "助動詞" => true,
_ => false,
make_matcher(dic, |pos| {
matches!(pos[0].deref(), "動詞" | "形容詞" | "助動詞")
})
}

3 changes: 1 addition & 2 deletions python/src/tokenizer.rs
@@ -21,7 +21,6 @@ use std::sync::Arc;
use pyo3::prelude::*;

use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;

use sudachi::dic::subset::InfoSubset;
use sudachi::prelude::*;

@@ -157,7 +156,7 @@ impl PyTokenizer {
None => None,
Some(m) => Some(extract_mode(m)?),
};
let default_mode = mode.map(|m| self.tokenizer.set_mode(m.into()));
let default_mode = mode.map(|m| self.tokenizer.set_mode(m));
let mut tokenizer = scopeguard::guard(&mut self.tokenizer, |t| {
default_mode.map(|m| t.set_mode(m));
});
4 changes: 2 additions & 2 deletions sudachi-cli/src/build.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,12 +14,12 @@
* limitations under the License.
*/

use memmap2::Mmap;
use std::fs::{File, OpenOptions};
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};

use clap::{Args, Subcommand};
use memmap2::Mmap;

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::Config;
17 changes: 6 additions & 11 deletions sudachi-cli/src/main.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -34,22 +34,17 @@ use sudachi::prelude::*;
#[cfg(feature = "bake_dictionary")]
const BAKED_DICTIONARY_BYTES: &[u8] = include_bytes!(env!("SUDACHI_DICT_PATH"));

#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, Eq, PartialEq, Default)]
pub enum SentenceSplitMode {
/// Do both sentence splitting and analysis
#[default]
Default,
/// Do only sentence splitting and not analysis
Only,
/// Do only analysis without sentence splitting
None,
}

impl Default for SentenceSplitMode {
fn default() -> Self {
SentenceSplitMode::Default
}
}

impl FromStr for SentenceSplitMode {
type Err = &'static str;

@@ -156,7 +151,7 @@ fn main() {
// output: stdout or file
let inner_writer: Box<dyn Write> = match &args.output_file {
Some(output_path) => Box::new(
File::create(&output_path)
File::create(output_path)
.unwrap_or_else(|_| panic!("Failed to open output file {:?}", &output_path)),
),
None => Box::new(io::stdout()),
@@ -207,10 +202,10 @@ fn strip_eol(data: &str) -> &str {
let mut bytes = data.as_bytes();
let mut len = bytes.len();
if len > 1 && bytes[len - 1] == b'\n' {
len = len - 1;
len -= 1;
bytes = &bytes[..len];
if len > 1 && bytes[len - 1] == b'\r' {
len = len - 1;
len -= 1;
bytes = &bytes[..len];
}
}
4 changes: 2 additions & 2 deletions sudachi-cli/src/output.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ impl Wakachi {

impl<T: DictionaryAccess> SudachiOutput<T> for Wakachi {
fn write(&self, writer: &mut Writer, morphemes: &MorphemeList<T>) -> SudachiResult<()> {
if morphemes.len() == 0 {
if morphemes.is_empty() {
writer.write_all(b"\n")?;
return Ok(());
}
18 changes: 8 additions & 10 deletions sudachi/src/analysis/created.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ impl CreatedWords {
const MAX_SHIFT: Carrier = CreatedWords::MAX_VALUE - 1;

pub fn empty() -> CreatedWords {
return Default::default();
Default::default()
}

pub fn single<Pos: Into<i64>>(length: Pos) -> CreatedWords {
@@ -55,7 +55,7 @@ impl CreatedWords {
#[must_use]
pub fn add_word<P: Into<i64>>(&self, length: P) -> CreatedWords {
let mask = CreatedWords::single(length);
return self.add(mask);
self.add(mask)
}

#[must_use]
@@ -67,21 +67,19 @@
let mask = CreatedWords::single(length);
if (self.0 & mask.0) == 0 {
HasWord::No
} else if length.into() >= CreatedWords::MAX_VALUE as _ {
HasWord::Maybe
} else {
if length.into() >= CreatedWords::MAX_VALUE as _ {
HasWord::Maybe
} else {
HasWord::Yes
}
HasWord::Yes
}
}

pub fn is_empty(&self) -> bool {
return self.0 == 0;
self.0 == 0
}

pub fn not_empty(&self) -> bool {
return !self.is_empty();
!self.is_empty()
}
}
