Merged
Changes from 6 commits
25 changes: 16 additions & 9 deletions python/src/build.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,18 +14,21 @@
* limitations under the License.
*/

use crate::dictionary::get_default_resource_dir;
use crate::errors;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyList, PyString, PyTuple, PyType};
use std::fs::{File, OpenOptions};
use std::io::BufWriter;
use std::path::Path;

use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyList, PyString, PyTuple, PyType};

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::Config;
use sudachi::dic::build::{DataSource, DictBuilder};
use sudachi::dic::dictionary::JapaneseDictionary;

use crate::dictionary::get_default_resource_dir;
use crate::errors;

pub fn register_functions(m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(build_system_dic, m)?)?;
m.add_function(wrap_pyfunction!(build_user_dic, m)?)?;
@@ -68,12 +71,14 @@ fn build_system_dic<'p>(
description: Option<&str>,
) -> PyResult<&'p PyList> {
let mut builder = DictBuilder::new_system();
description.map(|d| builder.set_description(d));
if let Some(d) = description {
builder.set_description(d)
}

let matrix_src = as_data_source(py, matrix)?;
errors::wrap_ctx(builder.read_conn(matrix_src), matrix)?;
for f in lex.iter() {
let lex_src = as_data_source(py, &f)?;
let lex_src = as_data_source(py, f)?;
errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?;
}
let out_file = match as_data_source(py, output)? {
@@ -110,10 +115,12 @@ fn build_user_dic<'p>(
};

let mut builder = DictBuilder::new_user(&system_dic);
description.map(|d| builder.set_description(d));
if let Some(d) = description {
builder.set_description(d)
}

for f in lex.iter() {
let lex_src = as_data_source(py, &f)?;
let lex_src = as_data_source(py, f)?;
errors::wrap_ctx(builder.read_lexicon(lex_src), &f)?;
}
let out_file = match as_data_source(py, output)? {
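Note (illustration only, not part of the diff): `Option::map` called purely for its side effect discards the resulting `Option<()>`, which Clippy flags as `option_map_unit_fn`; an explicit `if let` states the intent. A minimal standalone sketch with a hypothetical `Builder`:

```rust
struct Builder {
    description: String,
}

impl Builder {
    fn set_description(&mut self, d: &str) {
        self.description = d.to_string();
    }
}

fn main() {
    let description: Option<&str> = Some("my dictionary");
    let mut builder = Builder { description: String::new() };

    // Discouraged: `map` is used only for its side effect and the
    // resulting Option<()> is thrown away (Clippy: option_map_unit_fn).
    // description.map(|d| builder.set_description(d));

    // Preferred: run the call only when a value is present, explicitly.
    if let Some(d) = description {
        builder.set_description(d);
    }

    assert_eq!(builder.description, "my dictionary");
}
```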
22 changes: 10 additions & 12 deletions python/src/dictionary.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,18 +14,18 @@
* limitations under the License.
*/

use pyo3::prelude::*;
use pyo3::types::{PySet, PyString, PyTuple};
use std::convert::TryFrom;
use std::fmt::Write;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
use sudachi::analysis::Mode;

use crate::errors::{wrap, wrap_ctx, SudachiError as SudachiErr};
use pyo3::prelude::*;
use pyo3::types::{PySet, PyString, PyTuple};

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::analysis::Mode;
use sudachi::config::{Config, ConfigBuilder, SurfaceProjection};
use sudachi::dic::dictionary::JapaneseDictionary;
use sudachi::dic::grammar::Grammar;
@@ -35,6 +35,7 @@ use sudachi::plugin::input_text::InputTextPlugin;
use sudachi::plugin::oov::OovProviderPlugin;
use sudachi::plugin::path_rewrite::PathRewritePlugin;

use crate::errors::{wrap, wrap_ctx, SudachiError as SudachiErr};
use crate::morpheme::{PyMorphemeListWrapper, PyProjector};
use crate::pos_matcher::PyPosMatcher;
use crate::pretokenizer::PyPretokenizer;
@@ -178,10 +179,7 @@ impl PyDictionary {
}

let jdic = JapaneseDictionary::from_cfg(&config).map_err(|e| {
SudachiErr::new_err(format!(
"Error while constructing dictionary: {}",
e.to_string()
))
SudachiErr::new_err(format!("Error while constructing dictionary: {}", e))
})?;

let pos_data = jdic
@@ -414,7 +412,7 @@ fn config_repr(cfg: &Config) -> Result<String, std::fmt::Error> {
pub(crate) fn extract_mode<'py>(py: Python<'py>, mode: &'py PyAny) -> PyResult<Mode> {
if mode.is_instance_of::<PyString>() {
let mode = mode.str()?.to_str()?;
Mode::from_str(mode).map_err(|e| SudachiErr::new_err(e).into())
Mode::from_str(mode).map_err(SudachiErr::new_err)
} else if mode.is_instance_of::<PySplitMode>() {
let mode = mode.extract::<PySplitMode>()?;
Ok(Mode::from(mode))
@@ -431,7 +429,7 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
if config_opt.is_instance_of::<PyString>() {
let config_str = config_opt.str()?.to_str()?.trim();
// looks like json
if config_str.starts_with("{") && config_str.ends_with("}") {
if config_str.starts_with('{') && config_str.ends_with('}') {
let result = ConfigBuilder::from_bytes(config_str.as_bytes());
return wrap(result);
}
@@ -451,7 +449,7 @@ fn read_config(config_opt: &PyAny) -> PyResult<ConfigBuilder> {
return read_config(cfg_as_str);
}
Err(SudachiErr::new_err((
format!("passed config was not a string, json object or sudachipy.config.Config object"),
"passed config was not a string, json object or sudachipy.config.Config object".to_string(),
config_opt.into_py(py),
)))
}
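Note (standalone sketch, not part of the diff): `format!` already invokes `Display`, so `e.to_string()` inside the format arguments only adds an allocation (Clippy: `to_string_in_format_args`), and `format!` with no placeholders is a verbose `to_string()` (Clippy: `useless_format`). A small example using a plain `std::io::Error`:

```rust
use std::io;

fn main() {
    let e = io::Error::new(io::ErrorKind::NotFound, "resources/sudachi.json");

    // Redundant: e is first converted to a String, then formatted again.
    let wasteful = format!("Error while constructing dictionary: {}", e.to_string());

    // Same output, one allocation less: Display is used directly.
    let direct = format!("Error while constructing dictionary: {}", e);
    assert_eq!(wasteful, direct);

    // format! with no placeholders is just a roundabout to_string().
    let msg = "passed config was not a string, json object or sudachipy.config.Config object".to_string();
    assert_eq!(msg, format!("passed config was not a string, json object or sudachipy.config.Config object"));
}
```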
4 changes: 3 additions & 1 deletion python/src/errors.rs
@@ -14,9 +14,11 @@
* limitations under the License.
*/

use pyo3::{import_exception, PyResult};
use std::fmt::{Debug, Display};

use pyo3::prelude::*;
use pyo3::{import_exception, PyResult};

// Sudachi exception class is defined in Python
import_exception!(sudachipy.errors, SudachiError);

10 changes: 4 additions & 6 deletions python/src/morpheme.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -158,7 +158,7 @@ impl PyMorphemeListWrapper {
for (i, m) in list.iter().enumerate() {
result.push_str(m.surface().deref());
if i + 1 != nmorphs {
result.push_str(" ");
result.push(' ');
}
}
PyString::new(py, result.as_str())
@@ -193,7 +193,7 @@ impl PyMorphemeListWrapper {
}

fn __bool__(&self, py: Python) -> bool {
self.internal(py).len() != 0
!self.internal(py).is_empty()
}
}

@@ -387,9 +387,7 @@ impl PyMorpheme {
let splitted = list
.internal(py)
.split_into(mode, self.index, out_ref)
.map_err(|e| {
PyException::new_err(format!("Error while splitting morpheme: {}", e.to_string()))
})?;
.map_err(|e| PyException::new_err(format!("Error while splitting morpheme: {}", e)))?;

if add_single.unwrap_or(true) && !splitted {
list.internal(py)
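Note (sketch only, not part of the diff): for a single character, `String::push` avoids going through a one-byte `&str`, and `!s.is_empty()` reads as a predicate where `s.len() != 0` is the roundabout form (Clippy: `single_char_add_str`, `len_zero`). A standalone example with a hypothetical `join_surfaces` helper:

```rust
fn join_surfaces(surfaces: &[&str]) -> String {
    let mut result = String::new();
    for (i, s) in surfaces.iter().enumerate() {
        result.push_str(s);
        if i + 1 != surfaces.len() {
            // push(' ') instead of push_str(" "): one char, no slice needed
            result.push(' ');
        }
    }
    result
}

fn main() {
    let morphemes = ["国会", "議事堂", "前"];
    let joined = join_surfaces(&morphemes);
    assert_eq!(joined, "国会 議事堂 前");

    // is_empty() states the intent directly; len() != 0 says it indirectly.
    assert!(!joined.is_empty());
}
```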
5 changes: 2 additions & 3 deletions python/src/pos_matcher.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -49,7 +49,7 @@ impl PyPosMatcher {
fn create_from_fn(dic: &Arc<PyDicData>, func: &PyAny, py: Python) -> PyResult<Self> {
let mut data = Vec::new();
for (pos_id, pos) in dic.pos.iter().enumerate() {
let args = PyTuple::new(py, &[pos]);
let args = PyTuple::new(py, [pos]);
if func.call1(args)?.downcast::<PyBool>()?.is_true() {
data.push(pos_id as u16);
}
@@ -178,7 +178,6 @@ impl PyPosMatcher {
let max_id = self.dic.pos.len();
// map -> filter chain is needed to handle exactly u16::MAX POS entries
let values = (0..max_id)
.into_iter()
.map(|x| x as u16)
.filter(|id| !self.matcher.matches_id(*id));
let matcher = PosMatcher::new(values);
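Note (sketch only, not part of the diff): a `Range` such as `0..max_id` already implements `Iterator`, so the dropped `.into_iter()` was a no-op (Clippy: `useless_conversion`); the map → filter chain runs on the range directly. Illustrative values only:

```rust
fn main() {
    let max_id: usize = 5;
    let excluded: Vec<u16> = vec![1, 3];

    // `0..max_id` is already an Iterator; no `.into_iter()` needed.
    let values: Vec<u16> = (0..max_id)
        .map(|x| x as u16)
        .filter(|id| !excluded.contains(id))
        .collect();

    assert_eq!(values, vec![0, 2, 4]);
}
```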
20 changes: 11 additions & 9 deletions python/src/pretokenizer.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,21 +14,23 @@
* limitations under the License.
*/

use crate::dictionary::PyDicData;
use crate::errors::wrap;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector};
use std::cell::RefCell;
use std::sync::Arc;

use pyo3::intern;
use pyo3::prelude::*;
use pyo3::sync::GILOnceCell;
use pyo3::types::{PyList, PySlice, PyTuple, PyType};
use std::cell::RefCell;
use std::sync::Arc;
use thread_local::ThreadLocal;

use crate::projection::MorphemeProjection;
use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;
use sudachi::dic::subset::InfoSubset;
use sudachi::prelude::Mode;
use thread_local::ThreadLocal;

use crate::dictionary::PyDicData;
use crate::errors::wrap;
use crate::morpheme::{PyMorphemeList, PyMorphemeListWrapper, PyProjector};
use crate::projection::MorphemeProjection;

/// This struct performs actual tokenization
/// There should be at most one instance per thread of execution
@@ -150,7 +152,7 @@ impl PyPretokenizer {
}
Some(h) => {
let mrp: &PyAny = morphs.as_ref(py);
let args = PyTuple::new(py, &[index, string, mrp]);
let args = PyTuple::new(py, [index, string, mrp]);
h.as_ref(py).call1(args)
}
}
Expand Down
19 changes: 11 additions & 8 deletions python/src/projection.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,18 +14,22 @@
* limitations under the License.
*/

use crate::dictionary::PyDicData;
use crate::morpheme::PyProjector;
use pyo3::types::PyString;
use pyo3::{PyResult, Python};
use std::convert::TryFrom;
use std::ops::Deref;
use std::sync::Arc;

use pyo3::prelude::*;
use pyo3::types::PyString;
use pyo3::{PyResult, Python};

use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::SurfaceProjection;
use sudachi::pos::PosMatcher;
use sudachi::prelude::Morpheme;

use crate::dictionary::PyDicData;
use crate::morpheme::PyProjector;

pub(crate) trait MorphemeProjection {
fn project<'py>(&self, m: &Morpheme<Arc<PyDicData>>, py: Python<'py>) -> &'py PyString;
}
@@ -112,9 +116,8 @@ impl MorphemeProjection for NormalizedNouns {
}

fn conjugating_matcher<D: DictionaryAccess>(dic: &D) -> PosMatcher {
make_matcher(dic, |pos| match pos[0].deref() {
"動詞" | "形容詞" | "助動詞" => true,
_ => false,
make_matcher(dic, |pos| {
matches!(pos[0].deref(), "動詞" | "形容詞" | "助動詞")
})
}

Expand Down
9 changes: 4 additions & 5 deletions python/src/tokenizer.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@ use std::sync::Arc;
use pyo3::prelude::*;

use sudachi::analysis::stateful_tokenizer::StatefulTokenizer;

use sudachi::dic::subset::InfoSubset;
use sudachi::prelude::*;

@@ -145,7 +144,7 @@ impl PyTokenizer {
None => None,
Some(m) => Some(extract_mode(py, m)?),
};
let default_mode = mode.map(|m| self.tokenizer.set_mode(m.into()));
let default_mode = mode.map(|m| self.tokenizer.set_mode(m));
let mut tokenizer = scopeguard::guard(&mut self.tokenizer, |t| {
default_mode.map(|m| t.set_mode(m));
});
@@ -156,7 +155,7 @@ impl PyTokenizer {
tokenizer.do_tokenize()
});

err.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?;
err.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e)))?;

let out_list = match out {
None => {
@@ -177,7 +176,7 @@

morphemes
.collect_results(tokenizer.deref_mut())
.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e.to_string())))?;
.map_err(|e| SudachiPyErr::new_err(format!("Tokenization error: {}", e)))?;

Ok(out_list)
}
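Note (sketch only, not part of the diff): `m.into()` where source and target types are the same is an identity conversion (Clippy: `useless_conversion`), so the argument can be passed as-is. A hypothetical `Tokenizer`/`Mode` pair mirroring the set_mode round-trip above:

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
enum Mode { A, B }

struct Tokenizer { mode: Mode }

impl Tokenizer {
    // Takes Mode by value and returns the previous mode,
    // mirroring the save/restore pattern in the diff above.
    fn set_mode(&mut self, m: Mode) -> Mode {
        std::mem::replace(&mut self.mode, m)
    }
}

fn main() {
    let mut t = Tokenizer { mode: Mode::B };
    let m = Mode::A;

    // `t.set_mode(m.into())` would compile, but Into<Mode> for Mode is the
    // identity conversion (Clippy: useless_conversion); passing m is enough.
    let previous = t.set_mode(m);

    assert_eq!(previous, Mode::B);
    assert_eq!(t.mode, Mode::A);
}
```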
8 changes: 4 additions & 4 deletions sudachi-cli/src/build.rs
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -172,12 +172,12 @@ fn output_file(p: &Path) -> File {
OpenOptions::new()
.write(true)
.create_new(true)
.open(&p)
.open(p)
.unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e))
}

fn dump_part(dict: PathBuf, part: String, output: PathBuf) {
let file = File::open(&dict).expect("open failed");
let file = File::open(dict).expect("open failed");
let data = unsafe { Mmap::map(&file) }.expect("mmap failed");
let loader =
unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary");
@@ -215,7 +215,7 @@ fn dump_matrix<W: Write>(grammar: &Grammar, w: &mut W) {
for left in 0..conn.num_left() {
for right in 0..conn.num_right() {
let cost = conn.cost(left as _, right as _);
write!(w, "{} {} {}\n", left, right, cost).unwrap();
writeln!(w, "{} {} {}", left, right, cost).unwrap();
}
}
}
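Note (sketch only, not part of the diff): `writeln!` appends the newline itself, so the trailing `\n` can leave the format string (Clippy: `write_with_newline`); likewise the dropped `&` before `p` and `dict` removes a needless borrow where the callee takes `AsRef<Path>`. A compact sketch of a matrix dump loop with illustrative values, not the real `Grammar` type:

```rust
use std::io::Write;

// Dump a small connection-cost matrix as "left right cost" rows,
// mirroring the shape of the dump_matrix loop above.
fn dump_matrix<W: Write>(costs: &[Vec<i16>], w: &mut W) {
    for (left, row) in costs.iter().enumerate() {
        for (right, cost) in row.iter().enumerate() {
            // writeln! adds the newline, so "\n" drops out of the format string
            writeln!(w, "{} {} {}", left, right, cost).unwrap();
        }
    }
}

fn main() {
    let costs = vec![vec![0, 10], vec![-3, 5]];
    let mut out = Vec::new();
    dump_matrix(&costs, &mut out);
    assert_eq!(String::from_utf8(out).unwrap(), "0 0 0\n0 1 10\n1 0 -3\n1 1 5\n");
}
```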