diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7800425..e3ac562 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - lang: ["en", "de", "es"] # TODO: load this from build/languages.txt + lang: ["en", "de", "es"] steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 @@ -22,18 +22,9 @@ jobs: toolchain: stable target: wasm32-unknown-unknown - uses: Swatinem/rust-cache@v1 - - run: | - mkdir data - mkdir storage - - cd data - wget https://f000.backblazeb2.com/file/nlprule/${{ matrix.lang }}.zip - unzip ${{ matrix.lang }}.zip - - name: Build source - uses: actions-rs/cargo@v1 - with: - command: build - args: --all-features + - name: Build and test language + run: | + bash scripts/build_and_test.sh ${{ matrix.lang }} xx - name: Build source (WebAssembly) uses: actions-rs/cargo@v1 with: @@ -44,55 +35,18 @@ jobs: if: matrix.lang == 'en' with: token: ${{ secrets.GITHUB_TOKEN }} - args: --all-features - - name: Build binaries - uses: actions-rs/cargo@v1 - env: - RUST_LOG: INFO - with: - command: run - args: --all-features --bin compile -- --build-dir data/${{ matrix.lang }} --tokenizer-out storage/${{ matrix.lang }}_tokenizer.bin --rules-out storage/${{ matrix.lang }}_rules.bin + args: --features "binaries-en compile bin regex-all-test" - name: Run nlprule tests uses: actions-rs/cargo@v1 if: matrix.lang == 'en' with: command: test - args: --verbose --all-features --release - - name: Run disambiguation tests - uses: actions-rs/cargo@v1 - env: - RUST_LOG: WARN - with: - command: run - args: --all-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin - - name: Run disambiguation tests (with regex-fancy backend) - uses: actions-rs/cargo@v1 - if: matrix.lang == 'en' - env: - RUST_LOG: WARN - with: - command: run - args: --manifest-path nlprule/Cargo.toml --features "bin regex-onig" --no-default-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin - - name: Run disambiguation tests (with regex-onig backend) - uses: actions-rs/cargo@v1 - if: matrix.lang == 'en' - env: - RUST_LOG: WARN - with: - command: run - args: --manifest-path nlprule/Cargo.toml --features "bin regex-fancy" --no-default-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin - - name: Run grammar rule tests - uses: actions-rs/cargo@v1 - env: - RUST_LOG: WARN - with: - command: run - args: --all-features --bin test -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin --rules storage/${{ matrix.lang }}_rules.bin + args: --verbose --features "binaries-en" --release - name: Upload binaries as artifact uses: actions/upload-artifact@v2 with: name: binaries - path: storage/* + path: nlprule/storage/* matrix_prep: runs-on: ubuntu-latest @@ -107,145 +61,145 @@ jobs: # inputFile: '.github/workflows/matrix_includes.json' # Default input file path filter: '[?runOnEvent==`${{ github.event_name }}` || runOnEvent==`always`]' - python: - needs: [matrix_prep, rust] - strategy: - matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}} - runs-on: ${{ matrix.os }} - container: ${{ matrix.container }} - env: - working-directory: python - steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 # maturin needs Rust (obviously) - with: - profile: minimal - toolchain: stable - - uses: Swatinem/rust-cache@v1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - uses: actions/download-artifact@v2 - with: - name: binaries - path: storage - - name: Install GSED (if needed) # needed by set_version.sh - if: matrix.os == 'macos-latest' - run: | - brew install gnu-sed - - name: Update version (if release) - if: github.event_name == 'release' - run: | - bash scripts/set_version.sh ${{ github.event.release.tag_name }} - - name: Build and Test - run: | - # pybin is the directory with python binaries - PYBIN=${{ matrix.pybin }} + # python: + # needs: [matrix_prep, rust] + # strategy: + # matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}} + # runs-on: ${{ matrix.os }} + # container: ${{ matrix.container }} + # env: + # working-directory: python + # steps: + # - uses: actions/checkout@v2 + # - uses: actions-rs/toolchain@v1 # maturin needs Rust (obviously) + # with: + # profile: minimal + # toolchain: stable + # - uses: Swatinem/rust-cache@v1 + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v1 + # with: + # python-version: ${{ matrix.python-version }} + # - uses: actions/download-artifact@v2 + # with: + # name: binaries + # path: storage + # - name: Install GSED (if needed) # needed by set_version.sh + # if: matrix.os == 'macos-latest' + # run: | + # brew install gnu-sed + # - name: Update version (if release) + # if: github.event_name == 'release' + # run: | + # bash scripts/set_version.sh ${{ github.event.release.tag_name }} + # - name: Build and Test + # run: | + # # pybin is the directory with python binaries + # PYBIN=${{ matrix.pybin }} - if [ -z "${PYBIN}" ]; then - PIP_CMD="python -m pip" - PYTHON_CMD="python" - PYTEST_CMD="python -m pytest" - export MATURIN_CMD="maturin" - else - PIP_CMD="${PYBIN}/pip" - PYTHON_CMD="${PYBIN}/python" - PYTEST_CMD="${PYBIN}/pytest" - export MATURIN_CMD="${PYBIN}/maturin" - fi + # if [ -z "${PYBIN}" ]; then + # PIP_CMD="python -m pip" + # PYTHON_CMD="python" + # PYTEST_CMD="python -m pytest" + # export MATURIN_CMD="maturin" + # else + # PIP_CMD="${PYBIN}/pip" + # PYTHON_CMD="${PYBIN}/python" + # PYTEST_CMD="${PYBIN}/pytest" + # export MATURIN_CMD="${PYBIN}/maturin" + # fi - # if pybin is set, the venv will not be used - # still create it here for convenience since we need it on windows - ${PYTHON_CMD} -m venv venv - . venv/bin/activate || . venv/Scripts/activate # 'Scripts' on windows, 'bin' on Linux / macOS - ${PIP_CMD} install --upgrade pip - ${PIP_CMD} install maturin==0.9.4 pytest==6.1.2 + # # if pybin is set, the venv will not be used + # # still create it here for convenience since we need it on windows + # ${PYTHON_CMD} -m venv venv + # . venv/bin/activate || . venv/Scripts/activate # 'Scripts' on windows, 'bin' on Linux / macOS + # ${PIP_CMD} install --upgrade pip + # ${PIP_CMD} install maturin==0.9.4 pytest==6.1.2 - # remove potentially cached wheels - rm target/wheels/* || true - bash scripts/maturin.sh build --interpreter ${PYTHON_CMD} --release --manylinux 2014 + # # remove potentially cached wheels + # rm target/wheels/* || true + # bash scripts/maturin.sh build --interpreter ${PYTHON_CMD} --release --manylinux 2014 - # install the wheel in two different ways: - # 1. via pip: needed on manylinux - # 2. via maturin develop: needed on windows in venv - ${PIP_CMD} install $(ls target/wheels/* | head -n1) - bash scripts/maturin.sh develop --release + # # install the wheel in two different ways: + # # 1. via pip: needed on manylinux + # # 2. via maturin develop: needed on windows in venv + # ${PIP_CMD} install $(ls target/wheels/* | head -n1) + # bash scripts/maturin.sh develop --release - ${PYTEST_CMD} python/test.py -s - shell: bash - - name: Upload wheel as artifact - uses: actions/upload-artifact@v2 - with: - name: python-wheel - path: target/wheels/* + # ${PYTEST_CMD} python/test.py -s + # shell: bash + # - name: Upload wheel as artifact + # uses: actions/upload-artifact@v2 + # with: + # name: python-wheel + # path: target/wheels/* - publish: - runs-on: ubuntu-latest - needs: [rust, python] - if: github.event_name == 'release' + # publish: + # runs-on: ubuntu-latest + # needs: [rust, python] + # if: github.event_name == 'release' - steps: - - uses: actions/checkout@v2 - with: - ref: ${{ github.head_ref }} - - name: Set up Python 3.8 - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - uses: actions/download-artifact@v2 - with: - name: python-wheel - path: python-wheel - - uses: actions/download-artifact@v2 - with: - name: binaries - path: storage - - run: | - gzip storage/en_tokenizer.bin - gzip storage/en_rules.bin - gzip storage/de_tokenizer.bin - gzip storage/de_rules.bin - gzip storage/es_tokenizer.bin - gzip storage/es_rules.bin - - name: Update version - run: | - bash scripts/set_version.sh ${{ github.event.release.tag_name }} - - name: Publish on crates.io - run: | # --allow-dirty is only needed b/c of the README.md, we can be sure it is clean otherwise anyway because it is freshly checked out - cargo login $CARGO_KEY + # steps: + # - uses: actions/checkout@v2 + # with: + # ref: ${{ github.head_ref }} + # - name: Set up Python 3.8 + # uses: actions/setup-python@v1 + # with: + # python-version: 3.8 + # - uses: actions/download-artifact@v2 + # with: + # name: python-wheel + # path: python-wheel + # - uses: actions/download-artifact@v2 + # with: + # name: binaries + # path: storage + # - run: | + # gzip storage/en_tokenizer.bin + # gzip storage/en_rules.bin + # gzip storage/de_tokenizer.bin + # gzip storage/de_rules.bin + # gzip storage/es_tokenizer.bin + # gzip storage/es_rules.bin + # - name: Update version + # run: | + # bash scripts/set_version.sh ${{ github.event.release.tag_name }} + # - name: Publish on crates.io + # run: | # --allow-dirty is only needed b/c of the README.md, we can be sure it is clean otherwise anyway because it is freshly checked out + # cargo login $CARGO_KEY - cd nlprule - cp ../README.md README.md - cargo publish --allow-dirty - rm README.md - cd .. + # cd nlprule + # cp ../README.md README.md + # cargo publish --allow-dirty + # rm README.md + # cd .. - # allow crates.io index to update s. t. nlprule-build can depend on nlprule - sleep 1m + # # allow crates.io index to update s. t. nlprule-build can depend on nlprule + # sleep 1m - cd build - cargo publish --allow-dirty - cd .. - env: - CARGO_KEY: ${{ secrets.CARGO_KEY }} - - name: Publish on PyPI - run: | - pip install twine==3.3 - twine upload python-wheel/* - env: - TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} - TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} - - name: Upload release binaries - uses: alexellis/upload-assets@0.2.2 - env: - GITHUB_TOKEN: ${{ github.token }} - with: - asset_paths: '["./storage/*"]' - - run: | - rm -r python-wheel - rm -r storage - - uses: stefanzweifel/git-auto-commit-action@v4 - with: - commit_message: v${{ github.event.release.tag_name }} - branch: main + # cd build + # cargo publish --allow-dirty + # cd .. + # env: + # CARGO_KEY: ${{ secrets.CARGO_KEY }} + # - name: Publish on PyPI + # run: | + # pip install twine==3.3 + # twine upload python-wheel/* + # env: + # TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} + # TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} + # - name: Upload release binaries + # uses: alexellis/upload-assets@0.2.2 + # env: + # GITHUB_TOKEN: ${{ github.token }} + # with: + # asset_paths: '["./storage/*"]' + # - run: | + # rm -r python-wheel + # rm -r storage + # - uses: stefanzweifel/git-auto-commit-action@v4 + # with: + # commit_message: v${{ github.event.release.tag_name }} + # branch: main diff --git a/Cargo.toml b/Cargo.toml index a8efc09..d636835 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,6 @@ [workspace] members = [ "nlprule", - "build", "python" ] diff --git a/build/Cargo.toml b/build/Cargo.toml deleted file mode 100644 index 726af20..0000000 --- a/build/Cargo.toml +++ /dev/null @@ -1,26 +0,0 @@ -[package] -name = "nlprule-build" -version = "0.6.3" -authors = ["Benjamin Minixhofer ", "Bernhard Schuster "] -edition = "2018" -license = "MIT OR Apache-2.0" -description = "Build tools for a fast, low-resource Natural Language Processing and Error Correction library." -repository = "https://github.com/bminixhofer/nlprule" -keywords = ["text", "spelling", "language-processing", "nlp", "grammar"] -categories = ["science", "text-processing"] - -[dependencies] -flate2 = "1" -thiserror = "1" -zip = "0.5.9" -directories = "3" -reqwest = { version = "0.11", default_features = false, features = ["blocking", "rustls-tls"] } -nlprule = { path = "../nlprule", features = ["compile"], version = "0.6.3" } # BUILD_BINDINGS_COMMENT -# nlprule = { package = "nlprule-core", path = "../nlprule", features = ["compile"] } # BUILD_BINDINGS_UNCOMMENT -fs-err = "2.5" - -[dev-dependencies] -tempdir = "0.3" -smush = "0.1.5" -env_logger = "0.8" -nlprule_030 = { package = "nlprule", version = "0.3.0" } diff --git a/build/README.md b/build/README.md index c826acc..db2c7ec 100644 --- a/build/README.md +++ b/build/README.md @@ -1,34 +1,6 @@ # nlprule-build -This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule). It also provides: -1. Utility functions to download the binaries from their distribution source. -2. Scripts to create the nlprule build directories. - -## Development - -If you are using a development version of nlprule, the builder can build the binaries itself (instead of just fetching them): - -```rust -let nlprule_builder = nlprule_build::BinaryBuilder::new( - &["en"], - std::env::var("OUT_DIR").expect("OUT_DIR is set when build.rs is running"), -) -// this specifies that the binaries should be built if they are not found -.fallback_to_build_dir(true) -.build() -.validate(); -``` - -In that case, you should set - -```toml -[profile.dev] -build-override = { opt-level = 2 } -``` - -in your `Cargo.toml`. Building can be slow otherwise. - -The following has information how to acquire the nlpruile build directories and how to build and test the nlprule binaries. As a user you will typically not need to do this. +Utilities for creating build resources. ### Building and testing the nlprule binaries diff --git a/build/languages.txt b/build/languages.txt deleted file mode 100644 index f1723af..0000000 --- a/build/languages.txt +++ /dev/null @@ -1,3 +0,0 @@ -de -en -es \ No newline at end of file diff --git a/build/src/lib.rs b/build/src/lib.rs deleted file mode 100644 index 5dd85f8..0000000 --- a/build/src/lib.rs +++ /dev/null @@ -1,756 +0,0 @@ -//! This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule). -//! See `README.md` for details. - -use flate2::bufread::GzDecoder; -use fs::File; -use fs_err as fs; -use nlprule::{compile, rules_filename, tokenizer_filename}; -use std::fs::Permissions; -use std::{ - io::{self, BufReader, BufWriter, Cursor, Read}, - path::{Path, PathBuf}, - result, -}; -use zip::result::ZipError; - -pub type OtherError = Box; - -#[derive(Debug, thiserror::Error)] -pub enum Error { - #[error(transparent)] - RequestError(#[from] reqwest::Error), - #[error("Binaries were not found on the remote")] - BinariesNotFound, - #[error("Failed to validate {1:?} binary for lang {0}")] - ValidationFailed(String, Binary, #[source] nlprule::Error), - #[error(transparent)] - IoError(#[from] io::Error), - #[error(transparent)] - ZipError(#[from] ZipError), - #[error("error postprocessing binaries: {0}")] - PostprocessingError(#[source] OtherError), - #[error("error transforming binaries: {0}")] - TransformError(#[source] OtherError), - #[error("Collation failed")] - CollationFailed(#[source] nlprule::compile::Error), -} - -pub type Result = result::Result; - -/// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer binaries. -pub type TransformDataFn = Box) -> result::Result<(), OtherError>>; - -/// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer binaries. -pub type TransformPathFn = Box result::Result>; - -#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] -pub enum Binary { - Tokenizer, - Rules, -} - -impl Binary { - fn filename(&self, lang_code: &str) -> String { - match &self { - Binary::Tokenizer => tokenizer_filename(lang_code), - Binary::Rules => rules_filename(lang_code), - } - } -} - -/// Tries downloading the binaries from their distribution source. -/// -/// This implicitly unpacks the originally gzip'd sources and returns -/// an in-memory buffer. -fn obtain_binary_from_github_release( - version: &str, - lang_code: &str, - binary: Binary, -) -> Result> { - let filename = binary.filename(lang_code); - - let bytes = reqwest::blocking::get(&format!( - "https://github.com/bminixhofer/nlprule/releases/download/{}/{}.gz", - version, filename - ))? - .error_for_status() - .map_err(|e| { - if let Some(404) = e.status().map(|x| x.as_u16()) { - Error::BinariesNotFound - } else { - e.into() - } - })? - .bytes()?; - - let mut gz = GzDecoder::new(&bytes[..]); - let mut buffer = Vec::new(); - gz.read_to_end(&mut buffer)?; - - Ok(buffer) -} - -fn construct_cache_path( - version: &str, - lang_code: &str, - binary: Binary, - cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&TransformPathFn>, -) -> Result> { - let filename = binary.filename(lang_code); - - cache_dir - .map(move |dir| { - let path = dir.join(version).join(lang_code).join(&filename); - Ok(if let Some(transform_path_fn) = transform_path_fn { - transform_path_fn(path).map_err(Error::TransformError)? - } else { - path - }) - }) - .transpose() -} - -/// Returns the bytes for a binary which are either obtained -/// from the on-disk cache or from the distribution source. -/// If the on-disk cache is disabled or is not present, -/// it will attempt to download it via [`obtain_binary_from_github_release`]. -/// Also updates the cache. -/// -/// If `transform_data_fn` is set, the bytes returned from this function are the output -/// of `transform_data_fn` applied to the binencoded binaries. -fn obtain_binary_cache_or_github( - version: &str, - lang_code: &str, - binary: Binary, - cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&TransformPathFn>, - transform_data_fn: Option<&TransformDataFn>, -) -> Result> { - let cache_path = - construct_cache_path(version, lang_code, binary, cache_dir, transform_path_fn)?; - - // if the file can be read, the data is already cached and the transform was applied before - if let Some(ref cache_path) = cache_path { - if let Ok(bytes) = fs::read(cache_path) { - return Ok(bytes); - } - } - - // the binencoded data from github - let bytes_binenc = obtain_binary_from_github_release(version, lang_code, binary)?; - - // apply the transform if any to an intermediate buffer - let bytes_transformed = if let Some(transform_data_fn) = transform_data_fn { - let mut intermediate = Vec::::new(); - transform_data_fn(bytes_binenc.as_slice(), &mut intermediate) - .map_err(Error::TransformError)?; - intermediate - } else { - bytes_binenc - }; - - // update the cache entry - if let Some(ref cache_path) = cache_path { - fs::create_dir_all(cache_path.parent().expect("path must have parent"))?; - let mut cache_file = fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(cache_path)?; - io::copy(&mut bytes_transformed.as_slice(), &mut cache_file)?; - } - - Ok(bytes_transformed) -} - -fn assure_binary_availability( - version: &str, - lang_code: &str, - binary: Binary, - cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&TransformPathFn>, - transform_data_fn: Option<&TransformDataFn>, - out: PathBuf, -) -> Result<()> { - let source = obtain_binary_cache_or_github( - version, - lang_code, - binary, - cache_dir, - transform_path_fn, - transform_data_fn, - )?; - - let mut out_file = fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(out)?; - io::copy(&mut source.as_slice(), &mut out_file)?; - Ok(()) -} - -pub fn get_build_dir>(lang_code: &str, out_dir: P) -> Result<()> { - let bytes = reqwest::blocking::get(&format!( - "https://f000.backblazeb2.com/file/nlprule/{}.zip", - lang_code - ))? - .error_for_status()? - .bytes()?; - - // extract the zip file and write to directory, a bit annoying that this is so verbose - // adapted from https://github.com/zip-rs/zip/blob/master/examples/extract.rs - let mut archive = zip::ZipArchive::new(Cursor::new(bytes))?; - - for i in 0..archive.len() { - let mut file = archive.by_index(i)?; - let outpath = match file.enclosed_name() { - Some(path) => out_dir - .as_ref() - // the first component of the path is the zip file name e. g. "en" so we skip it - .join(path.iter().skip(1).collect::()), - None => continue, - }; - - if (&*file.name()).ends_with('/') { - fs::create_dir_all(&outpath)?; - } else { - if let Some(p) = outpath.parent() { - if !p.exists() { - fs::create_dir_all(&p)?; - } - } - let mut outfile = fs::File::create(&outpath)?; - io::copy(&mut file, &mut outfile)?; - } - - // Get and Set permissions - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - - if let Some(mode) = file.unix_mode() { - fs::set_permissions(&outpath, Permissions::from_mode(mode))?; - } - } - } - - Ok(()) -} - -/// Gets the language codes for the currently supported languages in ISO 639-1 (two-letter) format e. g. "en". -pub fn supported_language_codes() -> Vec<&'static str> { - include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/", "languages.txt")) - .lines() - .collect() -} - -/// Places all nlprule binaries for the given languages in some directory. -pub struct BinaryBuilder { - language_codes: Vec, - out_dir: PathBuf, - version: String, - cache_dir: Option, - fallback_to_build_dir: bool, - build_dir: Option, - outputs: Vec, - transform_path_fn: Option, - transform_data_fn: Option, -} - -impl BinaryBuilder { - /// ```plain - /// github release resource --[fn transform]--> $cache_dir --[fn postprocess]--> $OUT_DIR/ - /// ``` - /// - /// Acquires the rule and tokenizer binaries for one language by: - /// - Trying to download them from their distribution source (or load them local cache). - /// - If they are not found (i. e. a dev version of nlprule is used) and `fallback_to_build_dir` is true - /// downloads the latest build directory and builds the binaries from it. - /// This can still fail if the dev version is sufficiently outdated for the latest build dir. - /// In that case, the user is encouraged to update to a release or a newer git sha. - fn build_language(&mut self, lang_code: &str) -> Result<()> { - // adjust the destination path - let path_transform = |out: PathBuf| -> Result { - Ok( - if let Some(ref transform_path_fn) = self.transform_path_fn { - transform_path_fn(out).map_err(Error::TransformError)? - } else { - out - }, - ) - }; - - let tokenizer_out = path_transform(self.out_dir.join(tokenizer_filename(lang_code)))?; - let rules_out = path_transform(self.out_dir.join(rules_filename(lang_code)))?; - - let mut did_not_find_binaries = false; - - for (binary, out) in &[ - (Binary::Tokenizer, &tokenizer_out), - (Binary::Rules, &rules_out), - ] { - let out = out.to_owned().to_owned(); - match assure_binary_availability( - &self.version, - lang_code, - *binary, - self.cache_dir.as_ref(), - self.transform_path_fn.as_ref(), - self.transform_data_fn.as_ref(), - out, - ) { - Err(Error::BinariesNotFound) => { - did_not_find_binaries = true; - break; - } - res => res?, - } - } - - if did_not_find_binaries && self.fallback_to_build_dir { - // it is possible that the build dirs are cached too long i. e. not downloaded again although a new version is available - // this could lead to problems but is not easy to fix so it will stay this way unless problems are reported - let build_dir = self - .build_dir.as_ref() - .unwrap_or_else( - || self.cache_dir.as_ref().expect("need somewhere to store build dirs: either `cache_dir` or `build_dir_path` must be set if `fallback_to_build_dir` is true."), - ) - .join(lang_code); - - if !build_dir.exists() { - get_build_dir(lang_code, &build_dir).expect("error loading build directory"); - } - - let mut rules_sink = BufWriter::new( - fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(&rules_out)?, - ); - let mut tokenizer_sink = BufWriter::new( - fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(&tokenizer_out)?, - ); - if let Some(ref transform_data_fn) = self.transform_data_fn { - let mut transfer_buffer_rules = Vec::new(); - let mut transfer_buffer_tokenizer = Vec::new(); - - compile::compile( - build_dir, - &mut transfer_buffer_rules, - &mut transfer_buffer_tokenizer, - ) - .map_err(Error::CollationFailed)?; - - assert_ne!(transfer_buffer_rules.len(), 0); - assert_ne!(transfer_buffer_tokenizer.len(), 0); - - let mut transformed_buffer_rules = Vec::new(); - let mut transformed_buffer_tokenizer = Vec::new(); - - transform_data_fn( - transfer_buffer_rules.as_slice(), - &mut transformed_buffer_rules, - ) - .map_err(Error::TransformError)?; - transform_data_fn( - transfer_buffer_tokenizer.as_slice(), - &mut transformed_buffer_tokenizer, - ) - .map_err(Error::TransformError)?; - } else { - compile::compile(build_dir, &mut rules_sink, &mut tokenizer_sink) - .map_err(Error::CollationFailed)?; - }; - } else if did_not_find_binaries { - panic!( - "Did not find binaries for version {}. \ - If this is a development version, try setting `fallback_to_build_dir` to build the binaries yourself. \ - If this is a release, this should NOT happen.", - self.version - ); - } - - self.outputs.push(tokenizer_out); - self.outputs.push(rules_out); - Ok(()) - } - - /// Creates a new binary builder. `language_codes` must be in ISO 639-1 (two-letter) format. - /// If `language_codes` is `&[]`, uses all supported languages. - /// If this is used in a `build.rs`, `out_dir` should probably be the OUT_DIR environment variable. - pub fn new>(language_codes: &[&str], out_dir: P) -> Self { - let language_codes: Vec<_> = if language_codes.is_empty() { - supported_language_codes() - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - language_codes - .iter() - .map(ToOwned::to_owned) - .map(ToOwned::to_owned) - .collect::>() - }; - - let project_dir = directories::ProjectDirs::from("", "", "nlprule"); - // this should be CARGO_ARTIFACT_DIR once it is merged: https://github.com/rust-lang/rfcs/pull/3035 - let cache_dir = project_dir.as_ref().map(|x| x.cache_dir().to_owned()); - let build_dir = cache_dir.as_ref().map(|x| x.join("build_dirs")); - - let version = env!("CARGO_PKG_VERSION").to_owned(); - - BinaryBuilder { - language_codes, - out_dir: out_dir.as_ref().to_owned(), - version, - cache_dir, - fallback_to_build_dir: false, - build_dir, - outputs: Vec::new(), - transform_data_fn: None, - transform_path_fn: None, - } - } - - /// Sets the version for which to fetch binaries. - /// The version of `nlprule-build` (kept in sync with `nlprule` version) by default. - /// Typically does not need to be modified. - pub fn version>(mut self, version: S) -> Self { - self.version = version.into(); - self - } - - /// Sets the out directory. - pub fn out_dir(mut self, out_dir: PathBuf) -> Self { - self.out_dir = out_dir; - self - } - - /// Sets the cache directory. The user cache directory at e. g. `~/.cache/nlprule` by default. - pub fn cache_dir(mut self, cache_dir: Option) -> Self { - self.cache_dir = cache_dir; - self - } - - /// Sets whether to fallback to building from the build directory if no distributed binaries are found - /// (i. e. a development version of nlprule is used). - pub fn fallback_to_build_dir(mut self, fallback_to_build_dir: bool) -> Self { - self.fallback_to_build_dir = fallback_to_build_dir; - self - } - - /// Sets the path the build directories should be stored at. - /// Only relevant if `fallback_to_build_dir` is true. - /// `cache_dir.join("build_dirs")` by default. - pub fn build_dir(mut self, build_dir: Option) -> Self { - self.build_dir = build_dir; - self - } - - /// Builds by {downloading, copying, building} the binaries to the out directory. - pub fn build(mut self) -> Result { - self.language_codes - .clone() - .into_iter() - .try_for_each(|lang_code| self.build_language(&lang_code))?; - Ok(self) - } - - /// Validates the binaries by checking if they can be loaded by nlprule. - pub fn validate(&self) -> Result<()> { - for lang_code in &self.language_codes { - let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code)); - let rules_out = self.out_dir.join(rules_filename(lang_code)); - - nlprule::Rules::new(rules_out) - .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Rules, e))?; - nlprule::Tokenizer::new(tokenizer_out) - .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Tokenizer, e))?; - } - - Ok(()) - } - - /// Gets the paths to all files this builder created. - pub fn outputs(&self) -> &[PathBuf] { - &self.outputs - } - - /// Applies the given transformation function to the binary immediately after obtaining it. - /// This happens before placing the file in the cache (if any) so by using a compression - /// function the size of the cache directory can be reduced. - /// Modifies the path of the cached binaries by the given `path_fn`. - /// If no cache directory is set or the binaries are built from the build dir, the `path_fn` does nothing. - /// - /// The resulting files will then reside in the given cache dir if any. - /// - /// Attention: Any compression applied here, must be undone in the - /// `fn postprocess` provided closure to retain the original binenc file - /// to be consumed by the application code. - pub fn transform(mut self, proc_fn: D, path_fn: P) -> Self - where - // these signatures have to match the `TransformDataFn` and `TransformPathFn` types - D: Fn(&[u8], &mut Vec) -> result::Result<(), OtherError> + 'static, - P: Fn(PathBuf) -> result::Result + 'static, - { - self.transform_data_fn = Some(Box::new(proc_fn)); - self.transform_path_fn = Some(Box::new(path_fn)); - self - } - - /// Applies the given postprocessing function to the binaries e. g. for compression. - /// Modifies the output path by the given path function. - /// - /// # Example - /// - /// ```rust - /// # use nlprule_build::BinaryBuilder; - /// # use std::io::Write; - /// # let tempdir = tempdir::TempDir::new("builder_test")?; - /// # let tempdir = tempdir.path(); - /// # - /// # let mut builder = BinaryBuilder::new(&["en"], tempdir).version("0.3.0"); - /// builder - /// .build()? - /// .postprocess( - /// |reader, mut writer| { - /// let mut encoder = flate2::read::GzEncoder::new(reader, flate2::Compression::default()); - /// std::io::copy(&mut encoder, &mut writer)?; - /// Ok(()) - /// }, - /// |p| { - /// let mut path = p.as_os_str().to_os_string(); - /// path.push(".gz"); - /// path - /// }, - /// )?; - /// # Ok::<(), nlprule_build::Error>(()) - /// ``` - pub fn postprocess(mut self, proc_fn: C, path_fn: F) -> Result - where - C: Fn(BufReader, BufWriter) -> result::Result<(), OtherError>, - F: Fn(PathBuf) -> P, - P: AsRef, - { - for (i, path) in self.outputs.clone().into_iter().enumerate() { - let reader = BufReader::new(fs::File::open(&path)?); - - let new_path = path_fn(path.clone()); - let new_path = new_path.as_ref(); - - let writer = BufWriter::new(File::create(new_path)?); - - proc_fn(reader, writer).map_err(Error::PostprocessingError)?; - - if new_path != path { - self.outputs[i] = new_path.to_path_buf(); - fs::remove_file(path)?; - } - } - - Ok(self) - } -} - -#[cfg(test)] -mod tests { - use io::Write; - - use super::*; - - #[test] - fn getting_binary_works() -> Result<()> { - // this is nice to keep roughly in sync with the latest released version but it is not necessary - let tempdir = tempdir::TempDir::new("build_dir")?; - let tempdir = tempdir.path().join("foo.bin"); - assure_binary_availability("0.3.0", "en", Binary::Rules, None, None, None, tempdir)?; - - Ok(()) - } - - #[test] - fn getting_build_dir_works() -> Result<()> { - let _ = env_logger::builder().is_test(true).try_init(); - - let tempdir = tempdir::TempDir::new("build_dir_test")?; - let tempdir = tempdir.path(); - - get_build_dir("en", &tempdir)?; - - assert_eq!(fs::read_to_string(tempdir.join("lang_code.txt"))?, "en"); - - Ok(()) - } - - // TODO: causes problems in CI, maybe remove `fallback_to_build_dir` altogether? - // #[test] - // fn binary_builder_works() -> Result<()> { - // let tempdir = tempdir::TempDir::new("builder_test")?; - // let tempdir = tempdir.path(); - - // BinaryBuilder::new(&["en"], tempdir) - // .cache_dir(Some(tempdir.to_path_buf())) - // .fallback_to_build_dir(true) - // .build()? - // .validate()?; - - // Ok(()) - // } - - #[test] - fn binary_builder_works_with_released_version() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .build()?; - - Ok(()) - } - - #[test] - fn binary_builder_works_with_smush() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .build()? - .postprocess( - |mut buffer, mut writer| { - let mut tmp = Vec::new(); - buffer.read_to_end(&mut tmp)?; - Ok(writer.write_all(&smush::encode( - &tmp, - smush::Codec::Gzip, - smush::Quality::Default, - )?)?) - }, - |p| { - let mut path = p.as_os_str().to_os_string(); - path.push(".gz"); - path - }, - )?; - - let tokenizer_path = tempdir - .join(Path::new(&tokenizer_filename("en"))) - .with_extension("bin.gz"); - assert!(tokenizer_path.exists()); - let decoded = smush::decode(&fs::read(tokenizer_path)?, smush::Codec::Gzip).unwrap(); - - let _ = nlprule_030::Tokenizer::new_from(&mut decoded.as_slice()).unwrap(); - - Ok(()) - } - - #[test] - fn binary_builder_works_with_flate2() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - let builder = BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .build()? - .postprocess( - |mut buffer, writer| { - let mut tmp = Vec::new(); - buffer.read_to_end(&mut tmp)?; - Ok( - flate2::write::GzEncoder::new(writer, flate2::Compression::default()) - .write_all(&tmp)?, - ) - }, - |p| { - let mut path = p.as_os_str().to_os_string(); - path.push(".gz"); - path - }, - )?; - - assert_eq!( - builder.outputs(), - &[ - tempdir.join("en_tokenizer.bin.gz"), - tempdir.join("en_rules.bin.gz") - ] - ); - - let rules_path = tempdir - .join(Path::new(&rules_filename("en"))) - .with_extension("bin.gz"); - assert!(rules_path.exists()); - - let encoded = fs::read(rules_path)?; - let mut decoder = flate2::read::GzDecoder::new(&encoded[..]); - - let mut decoded = Vec::new(); - decoder.read_to_end(&mut decoded).unwrap(); - - let _ = nlprule_030::Rules::new_from(&mut decoded.as_slice()).unwrap(); - - Ok(()) - } - - #[test] - fn build_with_zstd_transform() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - let builder = BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .transform( - |buffer, writer| { - let data = smush::encode(buffer, smush::Codec::Zstd, smush::Quality::Maximum)?; - writer.write_all(&data)?; - Ok(()) - }, - |p: PathBuf| { - let mut s = p.to_string_lossy().to_string(); - s.push_str(".zstd"); - Ok(PathBuf::from(s)) - }, - ) - .build()? - .postprocess( - |mut buffer, mut writer| { - let mut tmp = Vec::new(); - buffer.read_to_end(&mut tmp)?; - let data = smush::decode(tmp.as_slice(), smush::Codec::Zstd)?; - writer.write_all(data.as_slice())?; - Ok(()) - }, - |p| { - let path = p.to_string_lossy(); - assert!(path.ends_with(".zstd")); - let end = path.len().saturating_sub(".zstd".len()); - assert_ne!(end, 0); - path[..end].to_owned() - }, - )?; - - assert_eq!( - builder.outputs(), - &[ - tempdir.join("en_tokenizer.bin"), - tempdir.join("en_rules.bin") - ] - ); - - let rules_path = tempdir - .join(Path::new(&rules_filename("en"))) - .with_extension("bin"); - assert!(rules_path.is_file()); - - let _ = nlprule_030::Rules::new(rules_path).unwrap(); - Ok(()) - } -} diff --git a/data/de/disambiguator_options.json b/data/de/disambiguator_options.json new file mode 100644 index 0000000..92bc9fc --- /dev/null +++ b/data/de/disambiguator_options.json @@ -0,0 +1,7 @@ +{ + "allow_errors": false, + "ignore_ids": [ + "DISAMBIGUATION/SUB_BEAMTE/1", + "DISAMBIGUATION/SUB_BEAMTE/2" + ] +} \ No newline at end of file diff --git a/nlprule/configs/de/rules.json b/data/de/rules_options.json similarity index 100% rename from nlprule/configs/de/rules.json rename to data/de/rules_options.json diff --git a/nlprule/configs/de/tagger.json b/data/de/tagger_options.json similarity index 100% rename from nlprule/configs/de/tagger.json rename to data/de/tagger_options.json diff --git a/nlprule/configs/en/tokenizer.json b/data/de/tokenizer_options.json similarity index 75% rename from nlprule/configs/en/tokenizer.json rename to data/de/tokenizer_options.json index 11eae18..0e8eb37 100644 --- a/nlprule/configs/en/tokenizer.json +++ b/data/de/tokenizer_options.json @@ -1,8 +1,4 @@ { - "allow_errors": false, - "ignore_ids": [ - "DISAMBIGUATION/BEST_JJS/0" - ], "extra_join_regexes": [ "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})" ] diff --git a/data/en/disambiguator_options.json b/data/en/disambiguator_options.json new file mode 100644 index 0000000..e76a2cc --- /dev/null +++ b/data/en/disambiguator_options.json @@ -0,0 +1,6 @@ +{ + "allow_errors": false, + "ignore_ids": [ + "DISAMBIGUATION/BEST_JJS/0" + ] +} \ No newline at end of file diff --git a/nlprule/configs/en/rules.json b/data/en/rules_options.json similarity index 100% rename from nlprule/configs/en/rules.json rename to data/en/rules_options.json diff --git a/nlprule/configs/en/tagger.json b/data/en/tagger_options.json similarity index 100% rename from nlprule/configs/en/tagger.json rename to data/en/tagger_options.json diff --git a/nlprule/configs/de/tokenizer.json b/data/en/tokenizer_options.json similarity index 67% rename from nlprule/configs/de/tokenizer.json rename to data/en/tokenizer_options.json index df21ad8..0e8eb37 100644 --- a/nlprule/configs/de/tokenizer.json +++ b/data/en/tokenizer_options.json @@ -1,9 +1,4 @@ { - "allow_errors": false, - "ignore_ids": [ - "DISAMBIGUATION/SUB_BEAMTE/1", - "DISAMBIGUATION/SUB_BEAMTE/2" - ], "extra_join_regexes": [ "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})" ] diff --git a/data/es/disambiguator_options.json b/data/es/disambiguator_options.json new file mode 100644 index 0000000..8305874 --- /dev/null +++ b/data/es/disambiguator_options.json @@ -0,0 +1,4 @@ +{ + "allow_errors": false, + "ignore_ids": [] +} \ No newline at end of file diff --git a/nlprule/configs/es/rules.json b/data/es/rules_options.json similarity index 100% rename from nlprule/configs/es/rules.json rename to data/es/rules_options.json diff --git a/nlprule/configs/es/tagger.json b/data/es/tagger_options.json similarity index 100% rename from nlprule/configs/es/tagger.json rename to data/es/tagger_options.json diff --git a/nlprule/configs/es/tokenizer.json b/data/es/tokenizer_options.json similarity index 89% rename from nlprule/configs/es/tokenizer.json rename to data/es/tokenizer_options.json index 402aa9b..10cac1f 100644 --- a/nlprule/configs/es/tokenizer.json +++ b/data/es/tokenizer_options.json @@ -1,6 +1,4 @@ { - "allow_errors": true, - "ignore_ids": [], "extra_split_chars": [ "-", "─", diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml index ff6628c..36bce79 100644 --- a/nlprule/Cargo.toml +++ b/nlprule/Cargo.toml @@ -53,21 +53,22 @@ quickcheck = "1.0" quickcheck_macros = "1.0" criterion = "0.3" -[build-dependencies] -serde_json = "1" -fs-err = "2.5" - [[bench]] name = "load" harness = false +required-features = ["binaries"] [features] default = ["regex-onig"] +binaries-de = [] +binaries-en = [] +binaries-es = [] +binaries-all = ["binaries-de", "binaries-en", "binaries-es"] + regex-onig = ["onig"] # to switch to the fancy-regex engine, disable default features and add this feature regex-fancy = ["fancy-regex"] - # this enables both regex backends at the same time and makes sure they are equivalent # used only for compilation and tests regex-all-test = ["regex-onig", "regex-fancy"] @@ -89,13 +90,17 @@ name = "compile" required-features = ["compile", "bin"] [[bin]] -name = "test" -required-features = ["bin"] +name = "test_en" +required-features = ["bin", "binaries-en"] [[bin]] -name = "run" -required-features = ["bin"] +name = "test_es" +required-features = ["bin", "binaries-es"] [[bin]] -name = "test_disambiguation" +name = "test_de" +required-features = ["bin", "binaries-de"] + +[[bin]] +name = "run" required-features = ["bin"] diff --git a/nlprule/benches/load.rs b/nlprule/benches/load.rs index 0e0c010..9dab5a8 100644 --- a/nlprule/benches/load.rs +++ b/nlprule/benches/load.rs @@ -1,17 +1,13 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use nlprule::{Rules, Tokenizer}; +use criterion::{criterion_group, criterion_main, Criterion}; +use nlprule::lang::en; use std::time::Duration; fn parse_tokenizer(c: &mut Criterion) { - c.bench_function("load tokenizer", |b| { - b.iter(|| Tokenizer::new(black_box("../storage/en_tokenizer.bin")).unwrap()) - }); + c.bench_function("load tokenizer", |b| b.iter(en::analyzer)); } fn parse_rules(c: &mut Criterion) { - c.bench_function("load rules", |b| { - b.iter(|| Rules::new(black_box("../storage/en_rules.bin")).unwrap()) - }); + c.bench_function("load rules", |b| b.iter(en::rules)); } fn no_warmup_criterion() -> Criterion { diff --git a/nlprule/build.rs b/nlprule/build.rs deleted file mode 100644 index 8eb2ad1..0000000 --- a/nlprule/build.rs +++ /dev/null @@ -1,62 +0,0 @@ -//! Compiles the language build configurations in configs/ into two files (one for the tokenizer, one for the rules) -//! so they can be inlined. These configs are included at compile time because they define the neccessary parameters to -//! run the rules for a language correctly. They are NOT user configuration. - -use fs::File; -use fs_err as fs; -use std::{collections::HashMap, io::BufWriter, path::Path}; - -fn main() { - let path = env!("CARGO_MANIFEST_DIR"); - let path = Path::new(path).join("configs"); - - let out_dir = - std::env::var("OUT_DIR").expect("OUT_DIR env var must be set when build.rs is run"); - let out_dir = Path::new(&out_dir); - - println!("cargo:rerun-if-changed={}", path.display()); - - for (filename, joined_filename) in &[ - ("tokenizer.json", "tokenizer_configs.json"), - ("rules.json", "rules_configs.json"), - ("tagger.json", "tagger_configs.json"), - ] { - let mut config_map: HashMap = HashMap::new(); - - for entry in fs::read_dir(&path).expect("must be able to read config dir") { - let entry = entry.expect("must be able to read config dir entry"); - - println!("cargo:rerun-if-changed={}", entry.path().display()); - - if entry.path().is_dir() { - let lang_code = entry - .path() - .file_name() - .expect("directory must have name") - .to_str() - .expect("directory name must be unicode") - .to_string(); - - let path = entry.path().join(filename); - - println!("cargo:rerun-if-changed={}", path.display()); - - let json_str = fs::read_to_string(path) - .unwrap_or_else(|_| panic!("{} for 'lang_code' must exist", filename)); - - config_map.insert( - lang_code, - serde_json::from_str(&json_str) - .unwrap_or_else(|_| panic!("{} for language must be valid json", filename)), - ); - } - } - - let config_writer = BufWriter::new( - File::create(out_dir.join(joined_filename)) - .expect("must be able to create file in out dir"), - ); - serde_json::to_writer_pretty(config_writer, &config_map) - .expect("must be able to write JSON to file"); - } -} diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index 6f63412..f17cea4 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -1,7 +1,19 @@ use clap::Clap; +use fs::{File, OpenOptions}; use fs_err as fs; -use nlprule::compile::{compile, Error}; -use std::io::BufWriter; + +use log::{info, warn}; +use nlprule::compile::{BuildComponent, BuildInfo, Error}; +use nlprule::components::{ + chunker::Chunker, + multiword_tagger::MultiwordTagger, + rules::{Disambiguator, Rules}, + tagger::Tagger, + tokenizer::Tokenizer, + Component, +}; +use serde::{Deserialize, Serialize}; +use std::path::Path; use std::path::PathBuf; #[derive(clap::Clap)] @@ -13,17 +25,96 @@ pub struct BuildOptions { #[clap(long, parse(from_os_str))] pub build_dir: PathBuf, #[clap(long, parse(from_os_str))] - pub tokenizer_out: PathBuf, - #[clap(long, parse(from_os_str))] - pub rules_out: PathBuf, + pub out_dir: PathBuf, +} + +#[derive(Serialize, Deserialize)] +struct BuildFilePaths { + lang_code: PathBuf, + tag_dict: Vec, + tag_remove_dict: Vec, + chunker: PathBuf, + disambiguator_xml: PathBuf, + rules_xml: PathBuf, + multiword_tags: PathBuf, + common_words: PathBuf, + regex_cache: PathBuf, + srx: PathBuf, + tagger_options: PathBuf, + rules_options: PathBuf, + tokenizer_options: PathBuf, + disambiguator_options: PathBuf, +} + +impl BuildFilePaths { + fn new>(build_dir: P) -> Self { + let p = build_dir.as_ref(); + BuildFilePaths { + lang_code: p.join("lang_code.txt"), + tag_dict: vec![p.join("tags/output.dump"), p.join("tags/added.txt")], + tag_remove_dict: vec![p.join("tags/removed.txt")], + chunker: p.join("chunker.json"), + disambiguator_xml: p.join("disambiguation.xml"), + rules_xml: p.join("grammar.xml"), + multiword_tags: p.join("tags/multiwords.txt"), + common_words: p.join("common.txt"), + regex_cache: p.join("regex_cache.bin"), + srx: p.join("segment.srx"), + tagger_options: p.join("tagger_options.json"), + rules_options: p.join("rules_options.json"), + tokenizer_options: p.join("tokenizer_options.json"), + disambiguator_options: p.join("disambiguator_options.json"), + } + } } fn main() -> Result<(), Error> { env_logger::init(); let opts = BuildOptions::parse(); + let paths = BuildFilePaths::new(opts.build_dir); + + fs::create_dir_all(&opts.out_dir)?; + + let paths_value = serde_json::to_value(&paths)?; + + let tagger = Tagger::build(serde_json::from_value(paths_value.clone())?, None)?; + let mut build_info = BuildInfo::new(&tagger, &paths.regex_cache)?; + + macro_rules! build { + ($component:ty) => { + info!("Creating component \"{}\".", <$component>::name()); + let instance_result = <$component>::build( + serde_json::from_value(paths_value.clone())?, + Some(&mut build_info), + ); + + match instance_result { + Ok(instance) => { + instance.to_writer( + &OpenOptions::new() + .write(true) + .create(true) + .open(opts.out_dir.join(format!("{}.bin", <$component>::name())))?, + )?; + } + Err(error) => { + warn!("Error creating \"{0}\": {1}. This is expected if the component does not exist for this language.", <$component>::name(), error); + } + } + }; + } + + build!(Tokenizer); + build!(Disambiguator); + build!(MultiwordTagger); + build!(Chunker); + build!(Rules); - let tokenizer_sink = BufWriter::new(fs::File::create(&opts.tokenizer_out)?); - let rules_sink = BufWriter::new(fs::File::create(&opts.rules_out)?); + // write the regex cache at the end, otherwise it isn't fully populated + bincode::serialize_into( + &File::create(&paths.regex_cache)?, + build_info.mut_regex_cache(), + )?; - compile(opts.build_dir, rules_sink, tokenizer_sink) + Ok(()) } diff --git a/nlprule/src/bin/run.rs b/nlprule/src/bin/run.rs index 1b4258c..d8c419d 100644 --- a/nlprule/src/bin/run.rs +++ b/nlprule/src/bin/run.rs @@ -1,28 +1,29 @@ -use clap::Clap; -use nlprule::{rules::Rules, tokenizer::Tokenizer}; +// use clap::Clap; +// use nlprule::{rules::Rules, tokenizer::Tokenizer}; -#[derive(Clap)] -#[clap( - version = "1.0", - author = "Benjamin Minixhofer " -)] -struct Opts { - text: String, - #[clap(long, short)] - tokenizer: String, - #[clap(long, short)] - rules: String, -} +// #[derive(Clap)] +// #[clap( +// version = "1.0", +// author = "Benjamin Minixhofer " +// )] +// struct Opts { +// text: String, +// #[clap(long, short)] +// tokenizer: String, +// #[clap(long, short)] +// rules: String, +// } -fn main() { - env_logger::init(); - let opts = Opts::parse(); +fn main() {} +// fn main() { +// env_logger::init(); +// let opts = Opts::parse(); - let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); - let rules = Rules::new(opts.rules).unwrap(); +// let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); +// let rules = Rules::new(opts.rules).unwrap(); - let tokens = tokenizer.pipe(&opts.text); +// let tokens = tokenizer.pipe(&opts.text); - println!("Tokens: {:#?}", tokens.collect::>()); - println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer)); -} +// println!("Tokens: {:#?}", tokens.collect::>()); +// println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer)); +// } diff --git a/nlprule/src/bin/test.rs b/nlprule/src/bin/test.rs deleted file mode 100644 index 3669a8e..0000000 --- a/nlprule/src/bin/test.rs +++ /dev/null @@ -1,41 +0,0 @@ -use clap::Clap; -use nlprule::{rules::Rules, tokenizer::Tokenizer}; - -#[derive(Clap)] -#[clap( - version = "1.0", - author = "Benjamin Minixhofer " -)] -struct Opts { - #[clap(long, short)] - tokenizer: String, - #[clap(long, short)] - rules: String, - #[clap(long, short)] - ids: Vec, -} - -fn main() { - env_logger::init(); - let opts = Opts::parse(); - - let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); - let rules_container = Rules::new(opts.rules).unwrap(); - let rules = rules_container.rules(); - - println!("Runnable rules: {}", rules.len()); - - let mut passes = 0; - for rule in rules { - if opts.ids.is_empty() || opts.ids.contains(&rule.id().to_string()) { - passes += rule.test(&tokenizer) as usize; - } - } - - println!("Rules passing tests: {}", passes); - if passes == rules.len() { - std::process::exit(0); - } else { - std::process::exit(1); - } -} diff --git a/nlprule/src/bin/test_de.rs b/nlprule/src/bin/test_de.rs new file mode 100644 index 0000000..af75f27 --- /dev/null +++ b/nlprule/src/bin/test_de.rs @@ -0,0 +1,6 @@ +use nlprule::lang::de; + +fn main() -> Result<(), nlprule::Error> { + env_logger::init(); + de::correcter().test() +} diff --git a/nlprule/src/bin/test_disambiguation.rs b/nlprule/src/bin/test_disambiguation.rs deleted file mode 100644 index 30321a3..0000000 --- a/nlprule/src/bin/test_disambiguation.rs +++ /dev/null @@ -1,42 +0,0 @@ -use clap::Clap; -use nlprule::tokenizer::Tokenizer; - -#[derive(Clap)] -#[clap( - version = "1.0", - author = "Benjamin Minixhofer " -)] -struct Opts { - #[clap(long)] - stop_at_error: bool, - #[clap(long, short)] - tokenizer: String, -} - -fn main() { - env_logger::init(); - let opts = Opts::parse(); - - let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); - let rules = tokenizer.rules(); - - println!("Last ID: {}", rules[rules.len() - 1].id()); - println!("Runnable rules: {}", rules.len()); - - let mut passes = 0; - - for rule in rules { - if rule.test(&tokenizer) { - passes += 1; - } else if opts.stop_at_error { - break; - } - } - - println!("Rules passing tests: {}", passes); - if passes == rules.len() { - std::process::exit(0); - } else { - std::process::exit(1); - } -} diff --git a/nlprule/src/bin/test_en.rs b/nlprule/src/bin/test_en.rs new file mode 100644 index 0000000..f7268fc --- /dev/null +++ b/nlprule/src/bin/test_en.rs @@ -0,0 +1,6 @@ +use nlprule::lang::en; + +fn main() -> Result<(), nlprule::Error> { + env_logger::init(); + en::correcter().test() +} diff --git a/nlprule/src/bin/test_es.rs b/nlprule/src/bin/test_es.rs new file mode 100644 index 0000000..c6c3ace --- /dev/null +++ b/nlprule/src/bin/test_es.rs @@ -0,0 +1,6 @@ +use nlprule::lang::es; + +fn main() -> Result<(), nlprule::Error> { + env_logger::init(); + es::correcter().test() +} diff --git a/nlprule/src/compile/impls.rs b/nlprule/src/compile/impls.rs deleted file mode 100644 index ec7fd7d..0000000 --- a/nlprule/src/compile/impls.rs +++ /dev/null @@ -1,822 +0,0 @@ -use bimap::BiMap; -use fs_err::File; -use log::warn; -use serde::{Deserialize, Serialize}; -use std::{ - collections::{HashMap, HashSet}, - hash::{Hash, Hasher}, - io::{self, BufRead, BufReader}, - path::Path, -}; - -use crate::{ - rule::{ - disambiguation::PosFilter, - engine::{ - composition::{GraphId, Matcher, PosMatcher, TextMatcher}, - Engine, - }, - id::Category, - DisambiguationRule, Rule, - }, - rules::{Rules, RulesLangOptions}, - tokenizer::{ - chunk, - multiword::{MultiwordTagger, MultiwordTaggerFields}, - tag::{Tagger, TaggerLangOptions, WordIdMap}, - Tokenizer, TokenizerLangOptions, - }, - types::*, - utils::{parallelism::MaybeParallelIterator, regex::Regex}, -}; - -use super::{parse_structure::BuildInfo, Error}; - -impl Tagger { - fn get_lines, S2: AsRef>( - paths: &[S1], - remove_paths: &[S2], - ) -> std::io::Result> { - let mut output = Vec::new(); - let mut disallowed: Vec = Vec::new(); - - for path in remove_paths { - let file = File::open(path.as_ref())?; - let reader = std::io::BufReader::new(file); - - for line in reader.lines() { - let line = line?; - if line.starts_with('#') { - continue; - } - - disallowed.push(line.to_string()); - } - } - - for path in paths { - let file = File::open(path.as_ref())?; - let reader = std::io::BufReader::new(file); - - for line in reader.lines() { - let line = line?; - if line.starts_with('#') { - continue; - } - - if disallowed.contains(&line) { - continue; - } - - let parts: Vec<_> = line.split('\t').collect(); - - let word = parts[0].to_string(); - let inflection = parts[1].to_string(); - let tag = parts[2].to_string(); - - output.push((word, inflection, tag)) - } - } - - Ok(output) - } - - /// Creates a tagger from raw files. - /// - /// # Arguments - /// * `paths`: Paths to files where each line contains the word, lemma and tag, respectively, - /// separated by tabs, to be added to the tagger. - /// * `remove_paths`: Paths to files where each line contains the word, lemma and tag, respectively, - /// separated by tabs, to be removed from the tagger if present in the files from `paths`. - pub(in crate::compile) fn from_dumps, S2: AsRef>( - paths: &[S1], - remove_paths: &[S2], - common_words: &HashSet, - lang_options: TaggerLangOptions, - ) -> std::io::Result { - let mut tag_store = HashSet::new(); - let mut word_store = HashSet::new(); - - // add language specific special tags - tag_store.extend(lang_options.extra_tags.iter().map(|x| x.as_str())); - - let lines = Tagger::get_lines(paths, remove_paths)?; - - let punct = "!\"#$%&\\'()*+,-./:;<=>?@[\\]^_`{|}~"; - for i in 0..punct.len() { - word_store.insert(&punct[i..(i + 1)]); - } - - word_store.extend(common_words.iter().map(|x| x.as_str())); - - for (word, inflection, tag) in lines.iter() { - word_store.insert(word); - word_store.insert(inflection); - tag_store.insert(tag); - } - - // the empty string must not be part of any wordlist - assert!(!word_store.contains("")); - - // word store ids should be consistent across runs - let mut word_store: Vec<_> = word_store.into_iter().collect(); - word_store.sort_unstable(); - - // add special empty string to wordlist, must be the first element to have id 0 - word_store.insert(0, ""); - - // tag store ids should be consistent across runs - let mut tag_store: Vec<_> = tag_store.into_iter().collect(); - tag_store.sort_unstable(); - - // add special part of speech tags, they must have ids starting from zero - for (i, special_pos) in SpecialPos::iter().enumerate() { - tag_store.insert(i, special_pos); - } - - let word_store: BiMap<_, _> = word_store - .iter() - .enumerate() - .map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32))) - .collect(); - let tag_store: BiMap<_, _> = tag_store - .iter() - .enumerate() - .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16))) - .collect(); - - let mut tags: Vec>> = vec![None; word_store.len()]; - - for (word, inflection, tag) in lines.iter() { - let word_id = word_store.get_by_left(word).unwrap(); - let lemma_id = word_store.get_by_left(inflection).unwrap(); - let pos_id = tag_store.get_by_left(tag).unwrap(); - - match &mut tags[word_id.value() as usize] { - Some(vec) => { - vec.push((*lemma_id, *pos_id)); - } - None => { - tags[word_id.value() as usize] = Some(vec![(*lemma_id, *pos_id)]); - } - } - } - - Ok(Tagger { - tags: WordIdMap(tags), - word_store, - tag_store, - lang_options, - }) - } -} - -impl MultiwordTagger { - pub(in crate::compile) fn from_dump>( - dump: P, - info: &BuildInfo, - ) -> Result { - let reader = BufReader::new(File::open(dump.as_ref())?); - let mut multiwords = Vec::new(); - - for line in reader.lines() { - let line = line?; - - // strip comments - let line = &line[..line.find('#').unwrap_or_else(|| line.len())].trim(); - if line.is_empty() { - continue; - } - let tab_split: Vec<_> = line.split('\t').collect(); - - let word: String = tab_split[0] - .split_whitespace() - .collect::>() - .join(" "); - let pos = info.tagger().id_tag(tab_split[1]).into_static(); - multiwords.push((word, pos)); - } - - Ok((MultiwordTaggerFields { multiwords }).into()) - } -} - -impl TextMatcher { - pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self { - // can not cache a matcher that depends on the graph - let set = if matcher.graph_id().is_some() { - None - } else if let either::Right(regex) = &matcher.matcher { - let mut hasher = DefaultHasher::default(); - regex.hash(&mut hasher); - matcher.negate.hash(&mut hasher); - matcher.empty_always_false.hash(&mut hasher); - let matcher_hash = hasher.finish(); - - if let Some(set) = info.mut_regex_cache().get(&matcher_hash) { - set.clone() - } else { - let data: Vec<_> = info.tagger().word_store().iter().collect(); - - let set: DefaultHashSet<_> = data - .into_maybe_par_iter() - .filter_map(|(word, id)| { - if matcher.is_match(word.as_str(), None, None) { - Some(*id) - } else { - None - } - }) - .collect(); - - // there are some regexes which match lots of strings - // this cutoff is pretty arbitrary but without any threshold the size of some sets blows up - // the vast majority of regexes matches less than 100 strings from manual inspection - let set = if set.len() > 100 { None } else { Some(set) }; - info.mut_regex_cache().insert(matcher_hash, set.clone()); - set - } - } else { - None - }; - - TextMatcher { matcher, set } - } -} - -impl PosMatcher { - pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self { - let mut mask = vec![false; info.tagger().tag_store().len()]; - - for (word, id) in info.tagger().tag_store().iter() { - mask[id.value() as usize] = matcher.is_match(word.as_str(), None, None); - } - - PosMatcher { mask } - } -} - -impl Rules { - pub(in crate::compile) fn from_xml>( - path: P, - build_info: &mut BuildInfo, - options: RulesLangOptions, - ) -> Self { - let rules = super::parse_structure::read_rules(path); - let mut errors: HashMap = HashMap::new(); - - let rules: Vec<_> = rules - .into_iter() - .filter_map(|x| match x { - Ok((rule_structure, group, category)) => { - let category = category.expect("grammar rules must have category"); - let id = Category::new(category.id.as_str()); - - let id = if let Some(group) = &group { - id.join(group.id.as_str()).join(group.n) - } else { - id.join( - rule_structure - .id - .as_ref() - .expect("ID must be set if not in group."), - ) - .join(0) - }; - - let rule_on = match rule_structure.default.as_deref() { - Some("off") | Some("temp_off") => false, - Some("on") | None => true, - Some(x) => panic!("unknown `default` value: {}", x), - }; - - let group_on = match group.as_ref().and_then(|x| x.default.as_deref()) { - Some("off") | Some("temp_off") => false, - Some("on") | None => true, - Some(x) => panic!("unknown `default` value: {}", x), - }; - - let category_on = match category.default.as_deref() { - Some("off") | Some("temp_off") => false, - Some("on") | None => true, - Some(x) => panic!("unknown `default` value: {}", x), - }; - - let name = rule_structure.name.as_ref().map_or_else( - || { - let group = group.as_ref().expect("must have group if name not set"); - group.name.clone() - }, - |x| x.clone(), - ); - - match Rule::from_rule_structure(rule_structure, build_info) { - Ok(mut rule) => { - if (options.ids.is_empty() - || options.ids.iter().any(|x| x.is_match(&id))) - && !options.ignore_ids.iter().any(|x| x.is_match(&id)) - { - rule.id = id; - rule.name = name; - rule.category_name = category.name; - rule.category_type = category.kind; - rule.enabled = category_on && group_on && rule_on; - Some(rule) - } else { - None - } - } - Err(x) => { - *errors.entry(format!("[Rule] {}", x)).or_insert(0) += 1; - None - } - } - } - Err(x) => { - *errors.entry(format!("[Structure] {}", x)).or_insert(0) += 1; - None - } - }) - .collect(); - - if !errors.is_empty() { - let mut errors: Vec<(String, usize)> = errors.into_iter().collect(); - errors.sort_by_key(|x| -(x.1 as i32)); - - warn!( - "Errors constructing Rules: {:#?}", - &errors - .iter() - .map(|(message, number)| format!("{} (n={})", message, number)) - .collect::>() - ); - } - - Rules { rules } - } -} - -impl Tokenizer { - pub(in crate::compile) fn from_xml>( - path: P, - build_info: &mut BuildInfo, - chunker: Option, - multiword_tagger: Option, - sentencizer: srx::Rules, - lang_options: TokenizerLangOptions, - ) -> Result { - let rules = super::parse_structure::read_disambiguation_rules(path); - let mut error = None; - - let rules: Vec<_> = rules - .into_iter() - .filter_map(|x| match x { - Ok((rule_structure, group, _)) => { - let id = Category::new("DISAMBIGUATION"); - - let id = if let Some(group) = &group { - id.join(group.id.as_str()).join(group.n) - } else { - id.join( - rule_structure - .id - .as_ref() - .expect("ID must be set if not in group."), - ) - .join(0) - }; - - match DisambiguationRule::from_rule_structure(rule_structure, build_info) { - Ok(mut rule) => { - if error.is_none() - && (lang_options.ids.is_empty() - || lang_options.ids.iter().any(|x| x.is_match(&id))) - && !lang_options.ignore_ids.iter().any(|x| x.is_match(&id)) - { - rule.id = id; - - Some(rule) - } else { - None - } - } - Err(x) => { - if error.is_none() { - error = Some(format!("[Rule] {}", x)); - } - None - } - } - } - Err(x) => { - if error.is_none() { - error = Some(format!("[Structure] {}", x)); - } - None - } - }) - .collect(); - - if let Some(x) = error { - if lang_options.allow_errors { - warn!("Error constructing Disambiguator: {}", x) - } else { - return Err(Error::Unexpected(format!( - "Error constructing Disambiguator: {}", - x - ))); - } - } - - Ok(Tokenizer { - tagger: build_info.tagger().clone(), - sentencizer, - chunker, - multiword_tagger, - rules, - lang_options, - }) - } -} - -#[derive(Deserialize)] -struct ModelData { - outcome_labels: Vec, - pmap: DefaultHashMap, -} - -#[derive(Serialize, Deserialize)] -pub(in crate::compile) struct ContextData { - parameters: Vec, - outcomes: Vec, -} - -impl From for chunk::Model { - fn from(data: ModelData) -> Self { - let mut outcomes: Vec = Vec::new(); - let mut parameters: Vec = Vec::new(); - - let pmap = data - .pmap - .into_iter() - .map(|(key, value)| { - assert_eq!(value.outcomes.len(), value.parameters.len()); - - let offset = outcomes.len(); - let length = value.outcomes.len(); - - outcomes.extend(value.outcomes); - parameters.extend(value.parameters); - - (chunk::hash::hash_str(&key), (offset, length)) - }) - .collect::>(); - - chunk::Model { - outcome_labels: data.outcome_labels, - outcomes, - parameters, - pmap, - } - } -} - -impl chunk::Chunker { - pub(in crate::compile) fn from_json( - reader: R, - ) -> Result { - #[derive(Deserialize)] - struct ChunkData { - token_model: ModelData, - pos_model: ModelData, - pos_tagdict: DefaultHashMap>, - chunk_model: ModelData, - } - - let chunk_data: ChunkData = serde_json::from_reader(reader)?; - Ok(chunk::Chunker { - token_model: chunk::MaxentTokenizer { - model: chunk_data.token_model.into(), - }, - pos_model: chunk::MaxentPosTagger { - model: chunk_data.pos_model.into(), - tagdict: chunk_data.pos_tagdict, - }, - chunk_model: chunk::MaxentChunker { - model: chunk_data.chunk_model.into(), - }, - }) - } -} - -impl PosFilter { - pub(in crate::compile) fn new(matcher: PosMatcher) -> Self { - PosFilter { matcher } - } -} - -impl Regex { - pub(in crate::compile) fn from_java_regex( - java_regex_str: &str, - full_match: bool, - case_sensitive: bool, - ) -> Result { - let regex_string = - super::utils::from_java_regex(java_regex_str, case_sensitive, full_match)?; - - let regex = Regex::new(regex_string); - if let Err(error) = regex.try_compile() { - return Err(Error::Regex(error)); - } - - Ok(regex) - } -} - -impl Engine { - pub(in crate::compile) fn to_graph_id(&self, id: usize) -> Result { - let mut id = GraphId(id); - - let map = match &self { - Engine::Token(engine) => &engine.composition.id_to_idx, - Engine::Text(_, id_to_idx) => &id_to_idx, - }; - - let max_id = *map - .keys() - .max() - .ok_or_else(|| Error::Unexpected("graph is empty".into()))?; - - // ideally this should throw an error but LT is more lenient than nlprule - if !map.contains_key(&id) { - id = max_id; - } - - Ok(id) - } -} - -mod composition { - use super::*; - use crate::{ - rule::engine::composition::{ - AndAtom, Atom, Composition, FalseAtom, GraphId, NotAtom, OffsetAtom, OrAtom, Part, - Quantifier, TrueAtom, - }, - utils::regex::Regex, - }; - - impl Atom { - fn iter_mut<'a>(&'a mut self) -> Box + 'a> { - match self { - Atom::ChunkAtom(_) - | Atom::SpaceBeforeAtom(_) - | Atom::TextAtom(_) - | Atom::WordDataAtom(_) - | Atom::FalseAtom(_) - | Atom::TrueAtom(_) => Box::new(std::iter::once(self)), - Atom::AndAtom(x) => Box::new(x.atoms.iter_mut()), - Atom::OrAtom(x) => Box::new(x.atoms.iter_mut()), - Atom::NotAtom(x) => x.atom.iter_mut(), - Atom::OffsetAtom(x) => x.atom.iter_mut(), - } - } - - pub(in crate::compile) fn mut_graph_ids(&mut self) -> Vec<&mut GraphId> { - let mut ids = Vec::new(); - - for atom in self.iter_mut() { - let id = match atom { - Atom::ChunkAtom(atom) => atom.matcher.mut_graph_id(), - Atom::TextAtom(atom) => atom.matcher.matcher.mut_graph_id(), - Atom::WordDataAtom(atom) => atom - .matcher - .inflect_matcher - .as_mut() - .and_then(|x| x.matcher.mut_graph_id()), - _ => { - continue; - } - }; - - if let Some(id) = id { - ids.push(id); - } - } - - ids - } - } - - impl Matcher { - pub(in crate::compile) fn new_regex( - regex: Regex, - negate: bool, - empty_always_false: bool, - ) -> Self { - Matcher { - matcher: either::Right(regex), - negate, - case_sensitive: true, // handled by regex, should maybe be an option - empty_always_false, - } - } - - pub(in crate::compile) fn new_string( - string_or_idx: either::Either, - negate: bool, - case_sensitive: bool, - empty_always_false: bool, - ) -> Self { - Matcher { - matcher: either::Left(string_or_idx), - negate, - case_sensitive, - empty_always_false, - } - } - - pub(in crate::compile) fn graph_id(&self) -> Option { - if let either::Left(either::Right(id)) = &self.matcher { - Some(*id) - } else { - None - } - } - - pub(in crate::compile) fn mut_graph_id(&mut self) -> Option<&mut GraphId> { - if let either::Left(either::Right(id)) = &mut self.matcher { - Some(id) - } else { - None - } - } - } - - impl Quantifier { - pub(in crate::compile) fn new(min: usize, max: usize) -> Self { - assert!(max >= min); - Quantifier { min, max } - } - } - - impl AndAtom { - pub(in crate::compile) fn and(atoms: Vec) -> Atom { - let mut atoms: Vec<_> = atoms - .into_iter() - .filter(|x| !matches!(x, Atom::TrueAtom { .. })) - .collect(); - - if atoms.is_empty() { - (TrueAtom {}).into() - } else if atoms.len() == 1 { - atoms.remove(0) - } else { - (AndAtom { atoms }).into() - } - } - } - - impl OrAtom { - pub(in crate::compile) fn or(atoms: Vec) -> Atom { - let mut atoms: Vec<_> = atoms - .into_iter() - .filter(|x| !matches!(x, Atom::FalseAtom { .. })) - .collect(); - - if atoms.is_empty() { - (FalseAtom {}).into() - } else if atoms.len() == 1 { - atoms.remove(0) - } else { - (OrAtom { atoms }).into() - } - } - } - - impl NotAtom { - pub(in crate::compile) fn not(atom: Atom) -> Atom { - match atom { - Atom::TrueAtom { .. } => FalseAtom::default().into(), - Atom::FalseAtom { .. } => TrueAtom::default().into(), - x => (NotAtom { atom: Box::new(x) }).into(), - } - } - } - - impl OffsetAtom { - pub(in crate::compile) fn new(atom: Atom, offset: isize) -> Self { - OffsetAtom { - atom: Box::new(atom), - offset, - } - } - } - - impl Composition { - pub(in crate::compile) fn new(mut parts: Vec) -> Result { - let mut id_to_idx = DefaultHashMap::default(); - id_to_idx.insert(GraphId(0), 0); - let mut current_id = 1; - - for (i, part) in parts.iter().enumerate() { - if part.visible { - id_to_idx.insert(GraphId(current_id), i + 1); - current_id += 1; - } - } - - let can_stop_mask = (0..parts.len()) - .map(|i| parts[i..].iter().all(|x| x.quantifier.min == 0)) - .collect(); - - for (i, part) in parts.iter_mut().enumerate() { - for id in part.atom.mut_graph_ids() { - loop { - let index = *id_to_idx.get(&id).ok_or_else(|| { - Error::Unexpected(format!("id must exist in graph: {:?}", id)) - })?; - - // ideally this should throw an error but LT is more lenient than nlprule - if index > i { - *id = GraphId(id.0 - 1); - } else { - break; - } - } - } - } - - Ok(Composition { - parts, - id_to_idx, - can_stop_mask, - }) - } - } -} - -pub(in crate::compile) mod filters { - use super::Error; - use std::collections::HashMap; - - use crate::{filter::*, rule::engine::Engine, utils::regex::Regex}; - - trait FromArgs: Sized { - fn from_args(args: HashMap, engine: &Engine) -> Result; - } - - impl FromArgs for NoDisambiguationEnglishPartialPosTagFilter { - fn from_args(args: HashMap, engine: &Engine) -> Result { - if args.contains_key("negate_postag") { - panic!("negate_postag not supported in NoDisambiguationEnglishPartialPosTagFilter"); - } - - Ok(NoDisambiguationEnglishPartialPosTagFilter { - id: engine.to_graph_id(args - .get("no") - .ok_or_else(|| { - Error::Unexpected( - "NoDisambiguationEnglishPartialPosTagFilter must have `no` argument" - .into(), - ) - })? - .parse::()?)?, - regexp: Regex::from_java_regex( - &args.get("regexp").ok_or_else(|| { - Error::Unexpected( - "NoDisambiguationEnglishPartialPosTagFilter must have `regexp` argument" - .into(), - ) - })?, - true, - true, - )?, - postag_regexp: Regex::from_java_regex( - &args.get("postag_regexp").ok_or_else(|| { - Error::Unexpected( - "NoDisambiguationEnglishPartialPosTagFilter must have `postag_regexp` argument" - .into(), - ) - })?, - true, - true, - )?, - negate_postag: args.get("negate_postag").map_or(false, |x| x == "yes"), - }) - } - } - - pub(in crate::compile) fn get_filter( - name: &str, - args: HashMap, - engine: &Engine, - ) -> Result { - match name { - "NoDisambiguationEnglishPartialPosTagFilter" => { - Ok(NoDisambiguationEnglishPartialPosTagFilter::from_args(args, engine)?.into()) - } - _ => Err(Error::Unexpected(format!("unsupported filter {}", name))), - } - } -} diff --git a/nlprule/src/compile/mod.rs b/nlprule/src/compile/mod.rs index c1258e1..965978c 100644 --- a/nlprule/src/compile/mod.rs +++ b/nlprule/src/compile/mod.rs @@ -1,62 +1,19 @@ -//! Creates the nlprule binaries from a *build directory*. Usage information in /build/README.md. - -use fs::File; -use fs_err as fs; - use std::{ hash::{Hash, Hasher}, - io::{self, BufReader, BufWriter}, + io::BufReader, num::ParseIntError, - path::{Path, PathBuf}, - str::FromStr, - sync::Arc, + path::Path, }; -use crate::{ - rules::Rules, - tokenizer::{chunk::Chunker, multiword::MultiwordTagger, tag::Tagger, Tokenizer}, - types::DefaultHasher, -}; -use log::info; - -use self::parse_structure::{BuildInfo, RegexCache}; -use thiserror::Error; +pub mod utils; -mod impls; -mod parse_structure; -mod structure; -mod utils; - -struct BuildFilePaths { - lang_code_path: PathBuf, - tag_paths: Vec, - tag_remove_paths: Vec, - chunker_path: PathBuf, - disambiguation_path: PathBuf, - grammar_path: PathBuf, - multiword_tag_path: PathBuf, - common_words_path: PathBuf, - regex_cache_path: PathBuf, - srx_path: PathBuf, -} +use crate::components::tagger::Tagger; -impl BuildFilePaths { - fn new>(build_dir: P) -> Self { - let p = build_dir.as_ref(); - BuildFilePaths { - lang_code_path: p.join("lang_code.txt"), - tag_paths: vec![p.join("tags/output.dump"), p.join("tags/added.txt")], - tag_remove_paths: vec![p.join("tags/removed.txt")], - chunker_path: p.join("chunker.json"), - disambiguation_path: p.join("disambiguation.xml"), - grammar_path: p.join("grammar.xml"), - multiword_tag_path: p.join("tags/multiwords.txt"), - common_words_path: p.join("common.txt"), - regex_cache_path: p.join("regex_cache.bin"), - srx_path: p.join("segment.srx"), - } - } -} +use crate::types::*; +use fs_err::File; +use log::info; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; +use thiserror::Error; #[derive(Error, Debug)] #[allow(missing_docs)] @@ -71,8 +28,6 @@ pub enum Error { Json(#[from] serde_json::Error), #[error(transparent)] Srx(#[from] srx::Error), - #[error("language options do not exist for '{lang_code}'")] - LanguageOptionsDoNotExist { lang_code: String }, #[error(transparent)] RegexSyntax(#[from] regex_syntax::ast::Error), #[error("regex compilation error: {0}")] @@ -83,119 +38,90 @@ pub enum Error { Unimplemented(String), #[error(transparent)] ParseError(#[from] ParseIntError), + #[error("`BuildInfo` is required to build this component, but is unset.")] + BuildInfoUnset, #[error("unknown error: {0}")] Other(#[from] Box), } -/// Compiles the binaries from a build directory. -pub fn compile( - build_dir: impl AsRef, - rules_dest: impl io::Write, - tokenizer_dest: impl io::Write, -) -> Result<(), Error> { - let paths = BuildFilePaths::new(&build_dir); - - let lang_code = fs::read_to_string(paths.lang_code_path)?; - - info!( - "Reading common words from {}.", - paths.common_words_path.display() - ); - let common_words = fs::read_to_string(paths.common_words_path)? - .lines() - .map(|x| x.to_string()) - .collect(); - - let tokenizer_lang_options = utils::tokenizer_lang_options(&lang_code).ok_or_else(|| { - Error::LanguageOptionsDoNotExist { - lang_code: lang_code.clone(), +pub trait BuildComponent: Sized { + type Paths: DeserializeOwned; + + fn build(paths: Self::Paths, build_info: Option<&mut BuildInfo>) -> Result; +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct RegexCache { + cache: DefaultHashMap>>, + // this is compared with the hash of the word store of the tagger + word_hash: u64, +} + +impl RegexCache { + pub fn new(word_hash: u64) -> Self { + RegexCache { + cache: DefaultHashMap::default(), + word_hash, } - })?; - - let rules_lang_options = - utils::rules_lang_options(&lang_code).ok_or_else(|| Error::LanguageOptionsDoNotExist { - lang_code: lang_code.clone(), - })?; - - let tagger_lang_options = - utils::tagger_lang_options(&lang_code).ok_or_else(|| Error::LanguageOptionsDoNotExist { - lang_code: lang_code.clone(), - })?; - - info!("Creating tagger."); - let tagger = Tagger::from_dumps( - &paths.tag_paths, - &paths.tag_remove_paths, - &common_words, - tagger_lang_options, - )?; - - let mut hasher = DefaultHasher::default(); - let mut word_store = tagger.word_store().iter().collect::>(); - word_store.sort_by(|a, b| a.1.cmp(b.1)); - word_store.hash(&mut hasher); - let word_store_hash = hasher.finish(); - - let regex_cache = if let Ok(file) = File::open(&paths.regex_cache_path) { - let cache: RegexCache = bincode::deserialize_from(BufReader::new(file))?; - if *cache.word_hash() == word_store_hash { + } + + pub fn word_hash(&self) -> &u64 { + &self.word_hash + } + + pub(crate) fn get(&self, key: &u64) -> Option<&Option>> { + self.cache.get(key) + } + + pub(crate) fn insert(&mut self, key: u64, value: Option>) { + self.cache.insert(key, value); + } +} + +pub struct BuildInfo<'a> { + tagger: &'a Tagger, + regex_cache: RegexCache, +} + +impl<'a> BuildInfo<'a> { + pub fn new>(tagger: &'a Tagger, regex_cache_path: P) -> Result { + let mut hasher = DefaultHasher::default(); + let mut word_store = tagger.word_store().iter().collect::>(); + word_store.sort_by(|a, b| a.1.cmp(b.1)); + word_store.hash(&mut hasher); + let word_store_hash = hasher.finish(); + + let regex_cache = if let Ok(file) = File::open(regex_cache_path.as_ref()) { + let cache: RegexCache = bincode::deserialize_from(BufReader::new(file))?; + if *cache.word_hash() == word_store_hash { + info!( + "Regex cache at {} is valid.", + regex_cache_path.as_ref().display() + ); + cache + } else { + info!("Regex cache was provided but is not valid. Rebuilding."); + RegexCache::new(word_store_hash) + } + } else { info!( - "Regex cache at {} is valid.", - paths.regex_cache_path.display() + "No regex cache provided. Building and writing to {}.", + regex_cache_path.as_ref().display() ); - cache - } else { - info!("Regex cache was provided but is not valid. Rebuilding."); RegexCache::new(word_store_hash) - } - } else { - info!( - "No regex cache provided. Building and writing to {}.", - paths.regex_cache_path.display() - ); - RegexCache::new(word_store_hash) - }; - - let mut build_info = BuildInfo::new(Arc::new(tagger), regex_cache); - let chunker = if paths.chunker_path.exists() { - info!("{} exists. Building chunker.", paths.chunker_path.display()); - let reader = BufReader::new(File::open(paths.chunker_path)?); - let chunker = Chunker::from_json(reader)?; - Some(chunker) - } else { - None - }; - let multiword_tagger = if paths.multiword_tag_path.exists() { - info!( - "{} exists. Building multiword tagger.", - paths.multiword_tag_path.display() - ); - Some(MultiwordTagger::from_dump( - paths.multiword_tag_path, - &build_info, - )?) - } else { - None - }; - - info!("Creating tokenizer."); - let tokenizer = Tokenizer::from_xml( - &paths.disambiguation_path, - &mut build_info, - chunker, - multiword_tagger, - srx::SRX::from_str(&fs::read_to_string(&paths.srx_path)?)?.language_rules(lang_code), - tokenizer_lang_options, - )?; - tokenizer.to_writer(tokenizer_dest)?; - - info!("Creating grammar rules."); - let rules = Rules::from_xml(&paths.grammar_path, &mut build_info, rules_lang_options); - rules.to_writer(rules_dest)?; - - // we need to write the regex cache after building the rules, otherwise it isn't fully populated - let f = BufWriter::new(File::create(&paths.regex_cache_path)?); - bincode::serialize_into(f, build_info.mut_regex_cache())?; - - Ok(()) + }; + + Ok(BuildInfo { + tagger, + regex_cache, + }) + } + + pub fn tagger(&self) -> &'a Tagger { + self.tagger + } + + pub fn mut_regex_cache(&mut self) -> &mut RegexCache { + &mut self.regex_cache + } } diff --git a/nlprule/src/compile/utils.rs b/nlprule/src/compile/utils.rs index 73b5322..53dab59 100644 --- a/nlprule/src/compile/utils.rs +++ b/nlprule/src/compile/utils.rs @@ -1,55 +1,3 @@ -use crate::{rules::RulesLangOptions, tokenizer::TokenizerLangOptions}; -use crate::{tokenizer::tag::TaggerLangOptions, types::*}; -use lazy_static::lazy_static; - -lazy_static! { - static ref TOKENIZER_LANG_OPTIONS: DefaultHashMap = { - serde_json::from_slice(include_bytes!(concat!( - env!("OUT_DIR"), - "/", - "tokenizer_configs.json" - ))) - .expect("tokenizer configs must be valid JSON") - }; -} - -lazy_static! { - static ref RULES_LANG_OPTIONS: DefaultHashMap = { - serde_json::from_slice(include_bytes!(concat!( - env!("OUT_DIR"), - "/", - "rules_configs.json" - ))) - .expect("rules configs must be valid JSON") - }; -} - -lazy_static! { - static ref TAGGER_LANG_OPTIONS: DefaultHashMap = { - serde_json::from_slice(include_bytes!(concat!( - env!("OUT_DIR"), - "/", - "tagger_configs.json" - ))) - .expect("tagger configs must be valid JSON") - }; -} - -/// Gets the tokenizer language options for the language code -pub(crate) fn tokenizer_lang_options(lang_code: &str) -> Option { - TOKENIZER_LANG_OPTIONS.get(lang_code).cloned() -} - -/// Gets the rules language options for the language code -pub(crate) fn rules_lang_options(lang_code: &str) -> Option { - RULES_LANG_OPTIONS.get(lang_code).cloned() -} - -/// Gets the tagger language options for the language code -pub(crate) fn tagger_lang_options(lang_code: &str) -> Option { - TAGGER_LANG_OPTIONS.get(lang_code).cloned() -} - pub(crate) use regex::from_java_regex; mod regex { diff --git a/nlprule/src/components/chunker/compile.rs b/nlprule/src/components/chunker/compile.rs new file mode 100644 index 0000000..73083fa --- /dev/null +++ b/nlprule/src/components/chunker/compile.rs @@ -0,0 +1,84 @@ +use std::{io::BufReader, path::PathBuf}; + +use fs_err::File; +use serde::Deserialize; + +use crate::compile::{BuildComponent, BuildInfo, Error}; + +use super::*; + +#[derive(Serialize, Deserialize)] +struct ContextData { + parameters: Vec, + outcomes: Vec, +} + +#[derive(Deserialize)] +struct ModelData { + outcome_labels: Vec, + pmap: DefaultHashMap, +} + +impl From for Model { + fn from(data: ModelData) -> Self { + let mut outcomes: Vec = Vec::new(); + let mut parameters: Vec = Vec::new(); + + let pmap = data + .pmap + .into_iter() + .map(|(key, value)| { + assert_eq!(value.outcomes.len(), value.parameters.len()); + + let offset = outcomes.len(); + let length = value.outcomes.len(); + + outcomes.extend(value.outcomes); + parameters.extend(value.parameters); + + (hash::hash_str(&key), (offset, length)) + }) + .collect::>(); + + Model { + outcome_labels: data.outcome_labels, + outcomes, + parameters, + pmap, + } + } +} + +#[derive(Deserialize)] +pub struct Paths { + chunker: PathBuf, +} + +impl BuildComponent for Chunker { + type Paths = Paths; + + fn build(paths: Paths, _build_info: Option<&mut BuildInfo>) -> Result { + #[derive(Deserialize)] + struct ChunkData { + token_model: ModelData, + pos_model: ModelData, + pos_tagdict: DefaultHashMap>, + chunk_model: ModelData, + } + + let chunk_data: ChunkData = + serde_json::from_reader(BufReader::new(File::open(paths.chunker)?))?; + Ok(Chunker { + token_model: MaxentTokenizer { + model: chunk_data.token_model.into(), + }, + pos_model: MaxentPosTagger { + model: chunk_data.pos_model.into(), + tagdict: chunk_data.pos_tagdict, + }, + chunk_model: MaxentChunker { + model: chunk_data.chunk_model.into(), + }, + }) + } +} diff --git a/nlprule/src/tokenizer/chunk.rs b/nlprule/src/components/chunker/mod.rs similarity index 95% rename from nlprule/src/tokenizer/chunk.rs rename to nlprule/src/components/chunker/mod.rs index 40ae936..8b6aad2 100644 --- a/nlprule/src/tokenizer/chunk.rs +++ b/nlprule/src/components/chunker/mod.rs @@ -1,12 +1,19 @@ //! A Chunker ported from [OpenNLP](https://opennlp.apache.org/). +#[cfg(feature = "compile")] +mod compile; + use half::bf16; +use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; use std::hash::{Hash, Hasher}; use std::{cmp::Ordering, collections::BinaryHeap}; +use crate::properties::*; use crate::types::{DefaultHashMap, DefaultHasher, Sentence}; +use super::Component; + fn softmax(vec: &mut Vec) { for x in vec.iter_mut() { *x = x.exp(); @@ -699,9 +706,22 @@ pub struct Chunker { pub(crate) chunk_model: MaxentChunker, } -impl Chunker { - /// Populates the `.chunks` field of the passed tokens by predicting with the maximum entropy model. - pub fn apply(&self, sentence: &mut Sentence) { +impl Transform for Chunker { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default() + .read(&[Property::Tags]) + .write(&[Property::Chunks]); + } + *PROPERTIES + } + + fn transform<'t>( + &'t self, + mut sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + let props = self.property_guard(&mut sentence)?; + let text = sentence.text().replace('’', "\'"); let mut bi_to_ci: DefaultHashMap = text @@ -757,8 +777,12 @@ impl Chunker { let contains_nns = sentence .iter() .find(|token| *token.span().char() == char_span) - .map(|token| token.tags().iter().any(|tag| tag.pos().as_str() == "NNS")) - .unwrap_or(false); + .map(|token| { + props + .tags(token) + .map(|tags| tags.iter().any(|tag| tag.pos().as_str() == "NNS")) + }) + .unwrap_or(Ok(false))?; if contains_nns { number = "plural"; @@ -791,9 +815,17 @@ impl Chunker { for token in sentence.iter_mut() { for (chunk, (_, char_span)) in chunks.iter().zip(internal_chunks.iter()) { if char_span == token.span().char() { - *token.chunks_mut() = (*chunk).clone(); + *props.chunks_mut(token)? = (*chunk).clone(); } } } + + Ok(sentence) + } +} + +impl Component for Chunker { + fn name() -> &'static str { + "chunker" } } diff --git a/nlprule/src/components/mod.rs b/nlprule/src/components/mod.rs new file mode 100644 index 0000000..8cdb152 --- /dev/null +++ b/nlprule/src/components/mod.rs @@ -0,0 +1,30 @@ +use std::{ + io::{BufReader, Read, Write}, + path::Path, +}; + +use fs_err::File; +use serde::{de::DeserializeOwned, Serialize}; + +pub mod chunker; +pub mod multiword_tagger; +pub mod rules; +pub mod tagger; +pub mod tokenizer; + +pub trait Component: Serialize + DeserializeOwned + Clone { + fn name() -> &'static str; + + fn new>(p: P) -> Result { + let reader = BufReader::new(File::open(p.as_ref())?); + Self::from_reader(reader) + } + + fn from_reader(reader: R) -> Result { + Ok(bincode::deserialize_from(reader)?) + } + + fn to_writer(&self, writer: W) -> Result<(), crate::Error> { + Ok(bincode::serialize_into(writer, self)?) + } +} diff --git a/nlprule/src/components/multiword_tagger/compile.rs b/nlprule/src/components/multiword_tagger/compile.rs new file mode 100644 index 0000000..c02b959 --- /dev/null +++ b/nlprule/src/components/multiword_tagger/compile.rs @@ -0,0 +1,46 @@ +use std::{ + io::{BufRead, BufReader}, + path::PathBuf, +}; + +use fs_err::File; + +use crate::compile::{BuildComponent, BuildInfo, Error}; + +use super::*; + +#[derive(Deserialize)] +pub struct Paths { + multiword_tags: PathBuf, +} + +impl BuildComponent for MultiwordTagger { + type Paths = Paths; + + fn build(paths: Paths, info: Option<&mut BuildInfo>) -> Result { + let tagger = info.ok_or(Error::BuildInfoUnset)?.tagger(); + + let reader = BufReader::new(File::open(paths.multiword_tags)?); + let mut multiwords = Vec::new(); + + for line in reader.lines() { + let line = line?; + + // strip comments + let line = &line[..line.find('#').unwrap_or_else(|| line.len())].trim(); + if line.is_empty() { + continue; + } + let tab_split: Vec<_> = line.split('\t').collect(); + + let word: String = tab_split[0] + .split_whitespace() + .collect::>() + .join(" "); + let pos = tagger.id_tag(tab_split[1]).into_static(); + multiwords.push((word, pos)); + } + + Ok((MultiwordTaggerFields { multiwords }).into()) + } +} diff --git a/nlprule/src/tokenizer/multiword.rs b/nlprule/src/components/multiword_tagger/mod.rs similarity index 72% rename from nlprule/src/tokenizer/multiword.rs rename to nlprule/src/components/multiword_tagger/mod.rs index 9af2ca7..03518fe 100644 --- a/nlprule/src/tokenizer/multiword.rs +++ b/nlprule/src/components/multiword_tagger/mod.rs @@ -1,12 +1,19 @@ //! Checks if the input text contains multi-token phrases from a finite list (might contain e. g. city names) and assigns lemmas and part-of-speech tags accordingly. +use crate::properties::*; use crate::types::*; use aho_corasick::AhoCorasick; +use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; +use super::Component; + +#[cfg(feature = "compile")] +mod compile; + #[derive(Serialize, Deserialize)] -pub(crate) struct MultiwordTaggerFields { - pub(crate) multiwords: Vec<(String, PosId<'static>)>, +struct MultiwordTaggerFields { + multiwords: Vec<(String, PosId<'static>)>, } impl From for MultiwordTagger { @@ -36,9 +43,20 @@ pub struct MultiwordTagger { multiwords: Vec<(String, PosId<'static>)>, } -impl MultiwordTagger { - /// Populates the `.multiword_data` field of the passed tokens by checking if any known phrases are contained. - pub fn apply<'t>(&'t self, sentence: &mut Sentence<'t>) { +impl Transform for MultiwordTagger { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); + } + *PROPERTIES + } + + fn transform<'t>( + &'t self, + mut sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + let props = self.property_guard(&mut sentence)?; + let tagger = sentence.tagger(); let mut start_indices = DefaultHashMap::new(); @@ -50,7 +68,7 @@ impl MultiwordTagger { .enumerate() .map(|(i, x)| { start_indices.insert(byte_index, i); - byte_index += x.text().0.len(); + byte_index += x.as_str().len(); end_indices.insert(byte_index, i); byte_index += " ".len(); @@ -66,11 +84,19 @@ impl MultiwordTagger { let (word, pos) = &self.multiwords[m.pattern()]; // end index is inclusive for token in sentence.iter_mut().skip(*start).take((end + 1) - start) { - token.tags_mut().push( + props.tags_mut(token)?.push( WordData::new(tagger.id_word(word.as_str().into()), pos.clone()).freeze(), ); } } } + + Ok(sentence) + } +} + +impl Component for MultiwordTagger { + fn name() -> &'static str { + "multiword_tagger" } } diff --git a/nlprule/src/components/rules/compile/mod.rs b/nlprule/src/components/rules/compile/mod.rs new file mode 100644 index 0000000..e2d6cf2 --- /dev/null +++ b/nlprule/src/components/rules/compile/mod.rs @@ -0,0 +1,234 @@ +mod structure; + +use fs_err::File; +use std::{io::BufReader, path::PathBuf}; +use log::warn; + +use crate::{ + compile::{BuildComponent, BuildInfo, Error}, + rule::id::Category, +}; + +use super::*; + +/// Options for a disambiguator. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct DisambiguatorLangOptions { + /// Whether to allow errors while constructing the tokenizer. + pub allow_errors: bool, + /// Disambiguation Rule selectors to use in this tokenizer. + #[serde(default)] + pub ids: Vec, + /// Disambiguation Rule selectors to ignore in this tokenizer. + #[serde(default)] + pub ignore_ids: Vec, +} + +#[derive(Deserialize)] +pub struct DisambiguatorPaths { + disambiguator_xml: PathBuf, + disambiguator_options: PathBuf, +} + +impl BuildComponent for Disambiguator { + type Paths = DisambiguatorPaths; + + fn build(paths: DisambiguatorPaths, build_info: Option<&mut BuildInfo>) -> Result { + let build_info = build_info.ok_or(Error::BuildInfoUnset)?; + + let options: DisambiguatorLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.disambiguator_options)?))?; + let rules = structure::parse::read_disambiguation_rules(paths.disambiguator_xml); + + let mut error = None; + + let rules: Vec<_> = rules + .into_iter() + .filter_map(|x| match x { + Ok((rule_structure, group, _)) => { + let id = Category::new("DISAMBIGUATION"); + + let id = if let Some(group) = &group { + id.join(group.id.as_str()).join(group.n) + } else { + id.join( + rule_structure + .id + .as_ref() + .expect("ID must be set if not in group."), + ) + .join(0) + }; + + match DisambiguationRule::from_rule_structure(rule_structure, build_info) { + Ok(mut rule) => { + if error.is_none() + && (options.ids.is_empty() + || options.ids.iter().any(|x| x.is_match(&id))) + && !options.ignore_ids.iter().any(|x| x.is_match(&id)) + { + rule.id = id; + + Some(rule) + } else { + None + } + } + Err(x) => { + if error.is_none() { + error = Some(format!("[Rule] {}", x)); + } + None + } + } + } + Err(x) => { + if error.is_none() { + error = Some(format!("[Structure] {}", x)); + } + None + } + }) + .collect(); + + if let Some(x) = error { + if options.allow_errors { + warn!("Error constructing Disambiguator: {}", x) + } else { + return Err(Error::Unexpected(format!( + "Error constructing Disambiguator: {}", + x + ))); + } + } + + Ok(Disambiguator { + rules, + properties: Default::default(), + }) + } +} + +/// Language-dependent options for a rule set. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct RulesLangOptions { + /// Whether to allow errors while constructing the rules. + pub allow_errors: bool, + /// Grammar Rule selectors to use in this set. + #[serde(default)] + pub ids: Vec, + /// Grammar Rule selectors to ignore in this set. + #[serde(default)] + pub ignore_ids: Vec, +} + +#[derive(Deserialize)] +pub struct RulesPaths { + rules_xml: PathBuf, + rules_options: PathBuf, +} + +impl BuildComponent for Rules { + type Paths = RulesPaths; + + fn build(paths: RulesPaths, build_info: Option<&mut BuildInfo>) -> Result { + let build_info = build_info.ok_or(Error::BuildInfoUnset)?; + + let options: RulesLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.rules_options)?))?; + let rules = structure::parse::read_rules(paths.rules_xml); + let mut errors: DefaultHashMap = DefaultHashMap::new(); + + let rules: Vec<_> = rules + .into_iter() + .filter_map(|x| match x { + Ok((rule_structure, group, category)) => { + let category = category.expect("grammar rules must have category"); + let id = Category::new(category.id.as_str()); + + let id = if let Some(group) = &group { + id.join(group.id.as_str()).join(group.n) + } else { + id.join( + rule_structure + .id + .as_ref() + .expect("ID must be set if not in group."), + ) + .join(0) + }; + + let rule_on = match rule_structure.default.as_deref() { + Some("off") | Some("temp_off") => false, + Some("on") | None => true, + Some(x) => panic!("unknown `default` value: {}", x), + }; + + let group_on = match group.as_ref().and_then(|x| x.default.as_deref()) { + Some("off") | Some("temp_off") => false, + Some("on") | None => true, + Some(x) => panic!("unknown `default` value: {}", x), + }; + + let category_on = match category.default.as_deref() { + Some("off") | Some("temp_off") => false, + Some("on") | None => true, + Some(x) => panic!("unknown `default` value: {}", x), + }; + + let name = rule_structure.name.as_ref().map_or_else( + || { + let group = group.as_ref().expect("must have group if name not set"); + group.name.clone() + }, + |x| x.clone(), + ); + + match Rule::from_rule_structure(rule_structure, build_info) { + Ok(mut rule) => { + if (options.ids.is_empty() + || options.ids.iter().any(|x| x.is_match(&id))) + && !options.ignore_ids.iter().any(|x| x.is_match(&id)) + { + rule.id = id; + rule.name = name; + rule.category_name = category.name; + rule.category_type = category.kind; + rule.enabled = category_on && group_on && rule_on; + Some(rule) + } else { + None + } + } + Err(x) => { + *errors.entry(format!("[Rule] {}", x)).or_insert(0) += 1; + None + } + } + } + Err(x) => { + *errors.entry(format!("[Structure] {}", x)).or_insert(0) += 1; + None + } + }) + .collect(); + + if !errors.is_empty() { + let mut errors: Vec<(String, usize)> = errors.into_iter().collect(); + errors.sort_by_key(|x| -(x.1 as i32)); + + warn!( + "Errors constructing Rules: {:#?}", + &errors + .iter() + .map(|(message, number)| format!("{} (n={})", message, number)) + .collect::>() + ); + } + + Ok(Rules { + rules, + properties: Default::default(), + }) + } +} diff --git a/nlprule/src/components/rules/compile/structure/impls.rs b/nlprule/src/components/rules/compile/structure/impls.rs new file mode 100644 index 0000000..27920df --- /dev/null +++ b/nlprule/src/components/rules/compile/structure/impls.rs @@ -0,0 +1,375 @@ +use std::{ + collections::hash_map::DefaultHasher, + hash::{Hash, Hasher}, +}; + +use crate::utils::parallelism::MaybeParallelIterator; +use crate::{ + compile::{BuildInfo, Error}, + rule::engine::{composition::*, Engine}, + utils::regex::Regex, +}; +use crate::{rule::disambiguation::PosFilter, types::*}; + +impl TextMatcher { + pub fn new(matcher: Matcher, info: &mut BuildInfo) -> Result { + // can not cache a matcher that depends on the graph + let set = if matcher.graph_id().is_some() { + None + } else if let either::Right(regex) = &matcher.matcher { + let mut hasher = DefaultHasher::default(); + regex.hash(&mut hasher); + matcher.negate.hash(&mut hasher); + matcher.empty_always_false.hash(&mut hasher); + let matcher_hash = hasher.finish(); + + if let Some(set) = info.mut_regex_cache().get(&matcher_hash) { + set.clone() + } else { + let data: Vec<_> = info.tagger().word_store().iter().collect(); + + let set: DefaultHashSet<_> = data + .into_maybe_par_iter() + .filter_map(|(word, id)| { + if matcher.is_match(word.as_str(), None, None) { + Some(*id) + } else { + None + } + }) + .collect(); + + // there are some regexes which match lots of strings + // this cutoff is pretty arbitrary but without any threshold the size of some sets blows up + // the vast majority of regexes matches less than 100 strings from manual inspection + let set = if set.len() > 100 { None } else { Some(set) }; + info.mut_regex_cache().insert(matcher_hash, set.clone()); + set + } + } else { + None + }; + + Ok(TextMatcher { matcher, set }) + } +} + +impl PosMatcher { + pub fn new(matcher: Matcher, info: &mut BuildInfo) -> Result { + let mut mask = vec![false; info.tagger().tag_store().len()]; + + for (word, id) in info.tagger().tag_store().iter() { + mask[id.value() as usize] = matcher.is_match(word.as_str(), None, None); + } + + Ok(PosMatcher { mask }) + } +} + +impl PosFilter { + pub fn new(matcher: PosMatcher) -> Self { + PosFilter { matcher } + } +} + +impl Regex { + pub fn from_java_regex( + java_regex_str: &str, + full_match: bool, + case_sensitive: bool, + ) -> Result { + let regex_string = + crate::compile::utils::from_java_regex(java_regex_str, case_sensitive, full_match)?; + + let regex = Regex::new(regex_string); + if let Err(error) = regex.try_compile() { + return Err(Error::Regex(error)); + } + + Ok(regex) + } +} + +impl Engine { + pub fn to_graph_id(&self, id: usize) -> Result { + let mut id = GraphId(id); + + let map = match &self { + Engine::Token(engine) => &engine.composition.id_to_idx, + Engine::Text(_, id_to_idx) => &id_to_idx, + }; + + let max_id = *map + .keys() + .max() + .ok_or_else(|| Error::Unexpected("graph is empty".into()))?; + + // ideally this should throw an error but LT is more lenient than nlprule + if !map.contains_key(&id) { + id = max_id; + } + + Ok(id) + } +} + +mod composition { + use super::*; + use crate::{ + rule::engine::composition::{ + AndAtom, Atom, Composition, FalseAtom, GraphId, NotAtom, OffsetAtom, OrAtom, Part, + Quantifier, TrueAtom, + }, + utils::regex::Regex, + }; + + impl Atom { + fn iter_mut<'a>(&'a mut self) -> Box + 'a> { + match self { + Atom::ChunkAtom(_) + | Atom::SpaceBeforeAtom(_) + | Atom::TextAtom(_) + | Atom::WordDataAtom(_) + | Atom::FalseAtom(_) + | Atom::TrueAtom(_) => Box::new(std::iter::once(self)), + Atom::AndAtom(x) => Box::new(x.atoms.iter_mut()), + Atom::OrAtom(x) => Box::new(x.atoms.iter_mut()), + Atom::NotAtom(x) => x.atom.iter_mut(), + Atom::OffsetAtom(x) => x.atom.iter_mut(), + } + } + + pub fn mut_graph_ids(&mut self) -> Vec<&mut GraphId> { + let mut ids = Vec::new(); + + for atom in self.iter_mut() { + let id = match atom { + Atom::ChunkAtom(atom) => atom.matcher.mut_graph_id(), + Atom::TextAtom(atom) => atom.matcher.matcher.mut_graph_id(), + Atom::WordDataAtom(atom) => atom + .matcher + .inflect_matcher + .as_mut() + .and_then(|x| x.matcher.mut_graph_id()), + _ => { + continue; + } + }; + + if let Some(id) = id { + ids.push(id); + } + } + + ids + } + } + + impl Matcher { + pub fn new_regex(regex: Regex, negate: bool, empty_always_false: bool) -> Self { + Matcher { + matcher: either::Right(regex), + negate, + case_sensitive: true, // handled by regex, should maybe be an option + empty_always_false, + } + } + + pub fn new_string( + string_or_idx: either::Either, + negate: bool, + case_sensitive: bool, + empty_always_false: bool, + ) -> Self { + Matcher { + matcher: either::Left(string_or_idx), + negate, + case_sensitive, + empty_always_false, + } + } + + pub fn graph_id(&self) -> Option { + if let either::Left(either::Right(id)) = &self.matcher { + Some(*id) + } else { + None + } + } + + pub fn mut_graph_id(&mut self) -> Option<&mut GraphId> { + if let either::Left(either::Right(id)) = &mut self.matcher { + Some(id) + } else { + None + } + } + } + + impl Quantifier { + pub fn new(min: usize, max: usize) -> Self { + assert!(max >= min); + Quantifier { min, max } + } + } + + impl AndAtom { + pub fn and(atoms: Vec) -> Atom { + let mut atoms: Vec<_> = atoms + .into_iter() + .filter(|x| !matches!(x, Atom::TrueAtom { .. })) + .collect(); + + if atoms.is_empty() { + (TrueAtom {}).into() + } else if atoms.len() == 1 { + atoms.remove(0) + } else { + (AndAtom { atoms }).into() + } + } + } + + impl OrAtom { + pub fn or(atoms: Vec) -> Atom { + let mut atoms: Vec<_> = atoms + .into_iter() + .filter(|x| !matches!(x, Atom::FalseAtom { .. })) + .collect(); + + if atoms.is_empty() { + (FalseAtom {}).into() + } else if atoms.len() == 1 { + atoms.remove(0) + } else { + (OrAtom { atoms }).into() + } + } + } + + impl NotAtom { + pub fn not(atom: Atom) -> Atom { + match atom { + Atom::TrueAtom { .. } => FalseAtom::default().into(), + Atom::FalseAtom { .. } => TrueAtom::default().into(), + x => (NotAtom { atom: Box::new(x) }).into(), + } + } + } + + impl OffsetAtom { + pub fn new(atom: Atom, offset: isize) -> Self { + OffsetAtom { + atom: Box::new(atom), + offset, + } + } + } + + impl Composition { + pub fn new(mut parts: Vec) -> Result { + let mut id_to_idx = DefaultHashMap::default(); + id_to_idx.insert(GraphId(0), 0); + let mut current_id = 1; + + for (i, part) in parts.iter().enumerate() { + if part.visible { + id_to_idx.insert(GraphId(current_id), i + 1); + current_id += 1; + } + } + + let can_stop_mask = (0..parts.len()) + .map(|i| parts[i..].iter().all(|x| x.quantifier.min == 0)) + .collect(); + + for (i, part) in parts.iter_mut().enumerate() { + for id in part.atom.mut_graph_ids() { + loop { + let index = *id_to_idx.get(&id).ok_or_else(|| { + Error::Unexpected(format!("id must exist in graph: {:?}", id)) + })?; + + // ideally this should throw an error but LT is more lenient than nlprule + if index > i { + *id = GraphId(id.0 - 1); + } else { + break; + } + } + } + } + + Ok(Composition { + parts, + id_to_idx, + can_stop_mask, + }) + } + } +} + +pub mod filters { + use super::Error; + use std::collections::HashMap; + + use crate::{filter::*, rule::engine::Engine, utils::regex::Regex}; + + trait FromArgs: Sized { + fn from_args(args: HashMap, engine: &Engine) -> Result; + } + + impl FromArgs for NoDisambiguationEnglishPartialPosTagFilter { + fn from_args(args: HashMap, engine: &Engine) -> Result { + if args.contains_key("negate_postag") { + panic!("negate_postag not supported in NoDisambiguationEnglishPartialPosTagFilter"); + } + + Ok(NoDisambiguationEnglishPartialPosTagFilter { + id: engine.to_graph_id(args + .get("no") + .ok_or_else(|| { + Error::Unexpected( + "NoDisambiguationEnglishPartialPosTagFilter must have `no` argument" + .into(), + ) + })? + .parse::()?)?, + regexp: Regex::from_java_regex( + &args.get("regexp").ok_or_else(|| { + Error::Unexpected( + "NoDisambiguationEnglishPartialPosTagFilter must have `regexp` argument" + .into(), + ) + })?, + true, + true, + )?, + postag_regexp: Regex::from_java_regex( + &args.get("postag_regexp").ok_or_else(|| { + Error::Unexpected( + "NoDisambiguationEnglishPartialPosTagFilter must have `postag_regexp` argument" + .into(), + ) + })?, + true, + true, + )?, + negate_postag: args.get("negate_postag").map_or(false, |x| x == "yes"), + }) + } + } + + pub fn get_filter( + name: &str, + args: HashMap, + engine: &Engine, + ) -> Result { + match name { + "NoDisambiguationEnglishPartialPosTagFilter" => { + Ok(NoDisambiguationEnglishPartialPosTagFilter::from_args(args, engine)?.into()) + } + _ => Err(Error::Unexpected(format!("unsupported filter {}", name))), + } + } +} diff --git a/nlprule/src/compile/structure.rs b/nlprule/src/components/rules/compile/structure/mod.rs similarity index 79% rename from nlprule/src/compile/structure.rs rename to nlprule/src/components/rules/compile/structure/mod.rs index eb38b43..883b9fc 100644 --- a/nlprule/src/compile/structure.rs +++ b/nlprule/src/components/rules/compile/structure/mod.rs @@ -1,7 +1,7 @@ -use fs_err::File; use serde::Deserialize; -use std::io::BufReader; -use xml::reader::EventReader; + +pub mod impls; +pub mod parse; mod preprocess { use std::{borrow::Cow, str::FromStr}; @@ -639,154 +639,3 @@ pub enum DisambiguationRuleContainer { RuleGroup(DisambiguationRuleGroup), Unification(Unification), } - -macro_rules! flatten_group { - ($rulegroup:expr, $category:expr) => {{ - let group_antipatterns = if let Some(antipatterns) = $rulegroup.antipatterns { - antipatterns - } else { - Vec::new() - }; - - let group = Group { - id: $rulegroup.id, - default: $rulegroup.default, - name: $rulegroup.name, - n: 0, - }; - - $rulegroup - .rules - .into_iter() - .enumerate() - .map(|(i, mut rule)| { - if let Some(antipatterns) = &mut rule.antipatterns { - antipatterns.extend(group_antipatterns.clone()); - } else { - rule.antipatterns = Some(group_antipatterns.clone()); - } - - let mut group = group.clone(); - group.n = i; - (rule, Some(group), $category.clone()) - }) - .collect::>() - }}; -} - -type GrammarRuleReading = (Rule, Option, Option); -type DisambiguationRuleReading = (DisambiguationRule, Option, Option); - -pub fn read_rules>( - path: P, -) -> Vec> { - let file = File::open(path.as_ref()).unwrap(); - let file = BufReader::new(file); - - let sanitized = preprocess::sanitize(file, &["suggestion"]); - let rules = preprocess::extract_rules(sanitized.as_bytes()); - - let mut unifications = Vec::new(); - - let rules: Vec<_> = rules - .into_iter() - .map(|(xml, category)| { - let mut out = Vec::new(); - - let deseralized = RuleContainer::deserialize(&mut serde_xml_rs::Deserializer::new( - EventReader::new(xml.as_bytes()), - )); - - out.extend(match deseralized { - Ok(rule_container) => match rule_container { - RuleContainer::Rule(rule) => { - vec![Ok((rule, None, category))] - } - RuleContainer::RuleGroup(rule_group) => flatten_group!(rule_group, category) - .into_iter() - .map(Ok) - .collect(), - RuleContainer::Unification(unification) => { - unifications.push(unification); - - vec![] - } - }, - Err(err) => vec![Err(err)], - }); - out - }) - .flatten() - .collect(); - - rules - .into_iter() - .map(|result| match result { - Ok(mut x) => { - x.0.unifications = Some(unifications.clone()); - - Ok(x) - } - Err(x) => Err(x), - }) - .collect() -} - -pub fn read_disambiguation_rules>( - path: P, -) -> Vec> { - let file = File::open(path.as_ref()).unwrap(); - let file = BufReader::new(file); - - let sanitized = preprocess::sanitize(file, &[]); - let rules = preprocess::extract_rules(sanitized.as_bytes()); - - let mut unifications = Vec::new(); - - let rules: Vec<_> = rules - .into_iter() - .map(|(xml, _)| { - let mut out = Vec::new(); - - let deseralized = DisambiguationRuleContainer::deserialize( - &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())), - ); - - let category: Option = None; - - out.extend(match deseralized { - Ok(rule_container) => match rule_container { - DisambiguationRuleContainer::Rule(rule) => { - vec![Ok((rule, None, category))] - } - DisambiguationRuleContainer::RuleGroup(rule_group) => { - flatten_group!(rule_group, category) - .into_iter() - .map(Ok) - .collect() - } - DisambiguationRuleContainer::Unification(unification) => { - unifications.push(unification); - - vec![] - } - }, - Err(err) => vec![Err(err)], - }); - out - }) - .flatten() - .collect(); - - rules - .into_iter() - .map(|result| match result { - Ok(mut x) => { - x.0.unifications = Some(unifications.clone()); - - Ok(x) - } - Err(x) => Err(x), - }) - .collect() -} diff --git a/nlprule/src/compile/parse_structure.rs b/nlprule/src/components/rules/compile/structure/parse.rs similarity index 72% rename from nlprule/src/compile/parse_structure.rs rename to nlprule/src/components/rules/compile/structure/parse.rs index 0be9924..f6449a6 100644 --- a/nlprule/src/compile/parse_structure.rs +++ b/nlprule/src/components/rules/compile/structure/parse.rs @@ -1,12 +1,12 @@ -use std::{ops::Range, sync::Arc}; +use std::{io::BufReader, ops::Range}; -use super::{structure, Error}; -use crate::{tokenizer::tag::Tagger, types::*}; +use crate::compile::{BuildInfo, Error}; +use crate::types::*; use crate::{utils, utils::regex::Regex}; +use fs_err::File; use lazy_static::lazy_static; -use serde::{Deserialize, Serialize}; - -pub use structure::{read_disambiguation_rules, read_rules}; +use serde::Deserialize; +use serde_xml_rs::EventReader; use crate::rule::disambiguation::*; use crate::rule::engine::composition::concrete::*; @@ -15,64 +15,16 @@ use crate::rule::engine::*; use crate::rule::grammar::*; use crate::rule::{id::Index, DisambiguationRule, Rule, Unification}; +use super::Category; + // this is set arbitrarily at the moment, could be an option #[inline] fn max_matches() -> usize { 20 } -#[derive(Serialize, Deserialize, Debug)] -pub(crate) struct RegexCache { - cache: DefaultHashMap>>, - // this is compared with the hash of the word store of the tagger - word_hash: u64, -} - -impl RegexCache { - pub fn new(word_hash: u64) -> Self { - RegexCache { - cache: DefaultHashMap::default(), - word_hash, - } - } - - pub fn word_hash(&self) -> &u64 { - &self.word_hash - } - - pub(crate) fn get(&self, key: &u64) -> Option<&Option>> { - self.cache.get(key) - } - - pub(crate) fn insert(&mut self, key: u64, value: Option>) { - self.cache.insert(key, value); - } -} - -pub(crate) struct BuildInfo { - tagger: Arc, - regex_cache: RegexCache, -} - -impl BuildInfo { - pub fn new(tagger: Arc, regex_cache: RegexCache) -> Self { - BuildInfo { - tagger, - regex_cache, - } - } - - pub fn tagger(&self) -> &Arc { - &self.tagger - } - - pub fn mut_regex_cache(&mut self) -> &mut RegexCache { - &mut self.regex_cache - } -} - fn parse_match_attribs( - attribs: impl structure::MatchAttributes, + attribs: impl super::MatchAttributes, text: Option<&str>, case_sensitive: bool, text_match_idx: Option, @@ -149,11 +101,11 @@ fn parse_match_attribs( }; if inflected { - inflect_matcher = Some(matcher); + inflect_matcher = Some(TextMatcher::new(matcher, info)?); } else { atoms.push( (TextAtom { - matcher: TextMatcher::new(matcher, info), + matcher: TextMatcher::new(matcher, info)?, }) .into(), ); @@ -172,13 +124,13 @@ fn parse_match_attribs( true, ) }; - pos_matcher = Some(PosMatcher::new(raw_matcher, info)); + pos_matcher = Some(PosMatcher::new(raw_matcher, info)?); } if pos_matcher.is_some() || inflect_matcher.is_some() { let matcher = WordDataMatcher { pos_matcher, - inflect_matcher: inflect_matcher.map(|x| TextMatcher::new(x, info)), + inflect_matcher, }; atoms.push( (WordDataAtom { @@ -234,7 +186,7 @@ fn parse_match_attribs( } fn get_exceptions( - token: &structure::Token, + token: &super::Token, case_sensitive: bool, only_shifted: bool, info: &mut BuildInfo, @@ -243,15 +195,11 @@ fn get_exceptions( let exceptions: Vec = parts .iter() .filter_map(|x| match x { - structure::TokenPart::Exception(x) => Some(x), + super::TokenPart::Exception(x) => Some(x), _ => None, }) .filter_map(|x| { - let exception_text = if let Some(exception_text) = &x.text { - Some(exception_text.as_str()) - } else { - None - }; + let exception_text = x.text.as_ref().map(|x| x.as_str()); let mut atom = match parse_match_attribs(x, exception_text, case_sensitive, None, info) { Ok(atom) => atom, @@ -287,14 +235,14 @@ fn get_exceptions( } fn parse_token( - token: &structure::Token, + token: &super::Token, case_sensitive: bool, info: &mut BuildInfo, ) -> Result, Error> { let mut parts = Vec::new(); let text = if let Some(parts) = &token.parts { parts.iter().find_map(|x| match x { - structure::TokenPart::Text(text) => Some(text.as_str()), + super::TokenPart::Text(text) => Some(text.as_str()), _ => None, }) } else { @@ -303,7 +251,7 @@ fn parse_token( let text_match_idx = if let Some(parts) = &token.parts { match parts.iter().find_map(|x| match x { - structure::TokenPart::Sub(sub) => Some(sub.no.parse::().map(|x| x + 1)), + super::TokenPart::Sub(sub) => Some(sub.no.parse::().map(|x| x + 1)), _ => None, }) { None => None, @@ -374,7 +322,7 @@ fn parse_token( Ok(parts) } -fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Result { +fn parse_match(m: super::Match, engine: &Engine, info: &mut BuildInfo) -> Result { if m.postag.is_some() || m.postag_regex.is_some() || m.postag_replace.is_some() @@ -396,11 +344,7 @@ fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Re m.no.parse::() .expect("no must be parsable as usize."); - let case_conversion = if let Some(conversion) = &m.case_conversion { - Some(conversion.as_str()) - } else { - None - }; + let case_conversion = m.case_conversion.as_deref(); let pos_replacer = if let Some(postag) = m.postag { if postag.contains("+DT") || postag.contains("+INDT") { @@ -418,7 +362,7 @@ fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Re x => panic!("unknown postag_regex value {:?}", x), }; Some(PosReplacer { - matcher: PosMatcher::new(matcher, info), + matcher: PosMatcher::new(matcher, info)?, }) } else { None @@ -495,17 +439,17 @@ fn parse_synthesizer_text(text: &str, engine: &Engine) -> Result Result { let mut parts = Vec::new(); for part in data.parts { match part { - structure::SuggestionPart::Text(text) => { + super::SuggestionPart::Text(text) => { parts.extend(parse_synthesizer_text(text.as_str(), engine)?); } - structure::SuggestionPart::Match(m) => { + super::SuggestionPart::Match(m) => { parts.push(SynthesizerPart::Match(parse_match(m, engine, info)?.into())); } } @@ -523,7 +467,7 @@ fn get_last_id(parts: &[Part]) -> isize { } fn parse_parallel_tokens( - tokens: &[structure::Token], + tokens: &[super::Token], case_sensitive: bool, info: &mut BuildInfo, ) -> Result, Error> { @@ -544,7 +488,7 @@ fn parse_parallel_tokens( } fn parse_tokens( - tokens: &[structure::TokenCombination], + tokens: &[super::TokenCombination], case_sensitive: bool, info: &mut BuildInfo, ) -> Result, Error> { @@ -552,8 +496,8 @@ fn parse_tokens( for token_combination in tokens { out.extend(match token_combination { - structure::TokenCombination::Token(token) => parse_token(token, case_sensitive, info)?, - structure::TokenCombination::And(tokens) => { + super::TokenCombination::Token(token) => parse_token(token, case_sensitive, info)?, + super::TokenCombination::And(tokens) => { let atom = AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); vec![Part { @@ -564,7 +508,7 @@ fn parse_tokens( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }] } - structure::TokenCombination::Or(tokens) => { + super::TokenCombination::Or(tokens) => { let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); vec![Part { atom, @@ -574,7 +518,7 @@ fn parse_tokens( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }] } - structure::TokenCombination::Feature(_) => Vec::new(), + super::TokenCombination::Feature(_) => Vec::new(), }); } @@ -582,7 +526,7 @@ fn parse_tokens( } fn parse_pattern( - pattern: structure::Pattern, + pattern: super::Pattern, info: &mut BuildInfo, ) -> Result<(Composition, usize, usize), Error> { let mut start = None; @@ -596,17 +540,17 @@ fn parse_pattern( for part in &pattern.parts { match part { - structure::PatternPart::Token(token) => { + super::PatternPart::Token(token) => { composition_parts.extend(parse_token(token, case_sensitive, info)?) } - structure::PatternPart::Marker(marker) => { + super::PatternPart::Marker(marker) => { start = Some(get_last_id(&composition_parts)); composition_parts.extend(parse_tokens(&marker.tokens, case_sensitive, info)?); end = Some(get_last_id(&composition_parts)); } - structure::PatternPart::And(tokens) => { + super::PatternPart::And(tokens) => { let atom = AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); @@ -618,7 +562,7 @@ fn parse_pattern( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }); } - structure::PatternPart::Or(tokens) => { + super::PatternPart::Or(tokens) => { let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); composition_parts.push(Part { @@ -629,7 +573,7 @@ fn parse_pattern( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }); } - structure::PatternPart::Feature(_) => {} + super::PatternPart::Feature(_) => {} } } @@ -642,12 +586,12 @@ fn parse_pattern( } fn parse_features( - pattern: &structure::Pattern, - unifications: &Option>, + pattern: &super::Pattern, + unifications: &Option>, info: &mut BuildInfo, -) -> Vec> { +) -> Result>, Error> { let mut filters = Vec::new(); - let mut parse_feature = |id: &str| -> Vec { + let mut parse_feature = |id: &str| -> Result, Error> { let unification = unifications .as_ref() .unwrap() @@ -670,11 +614,11 @@ fn parse_features( for part in &pattern.parts { match part { - structure::PatternPart::Feature(feature) => filters.push(parse_feature(&feature.id)), - structure::PatternPart::Marker(marker) => { + super::PatternPart::Feature(feature) => filters.push(parse_feature(&feature.id)?), + super::PatternPart::Marker(marker) => { for token_combination in &marker.tokens { - if let structure::TokenCombination::Feature(feature) = token_combination { - filters.push(parse_feature(&feature.id)); + if let super::TokenCombination::Feature(feature) = token_combination { + filters.push(parse_feature(&feature.id)?); } } } @@ -682,14 +626,11 @@ fn parse_features( } } - filters + Ok(filters) } impl Rule { - pub(crate) fn from_rule_structure( - data: structure::Rule, - info: &mut BuildInfo, - ) -> Result { + pub fn from_rule_structure(data: super::Rule, info: &mut BuildInfo) -> Result { if data.filter.is_some() { return Err(Error::Unimplemented( "rules with filter are not implemented.".into(), @@ -756,7 +697,7 @@ impl Rule { }; let unify_data = if let Some(pattern) = &data.pattern { - let unify_filters = parse_features(&pattern, &data.unifications, info); + let unify_filters = parse_features(&pattern, &data.unifications, info)?; let unify_mask: Vec<_> = maybe_composition .unwrap() .parts @@ -773,16 +714,16 @@ impl Rule { for part in data.message.parts { match part { - structure::MessagePart::Suggestion(suggestion) => { + super::MessagePart::Suggestion(suggestion) => { let suggester = parse_suggestion(suggestion.clone(), &engine, info)?; // simpler to just parse a second time than cloning the result message_parts.extend(parse_suggestion(suggestion, &engine, info)?.parts); suggesters.push(suggester); } - structure::MessagePart::Text(text) => { + super::MessagePart::Text(text) => { message_parts.extend(parse_synthesizer_text(text.as_str(), &engine)?); } - structure::MessagePart::Match(m) => { + super::MessagePart::Match(m) => { message_parts.push(SynthesizerPart::Match( parse_match(m, &engine, info)?.into(), )); @@ -817,10 +758,10 @@ impl Rule { for part in &example.parts { match part { - structure::ExamplePart::Text(text) => { + super::ExamplePart::Text(text) => { texts.push(text.as_str()); } - structure::ExamplePart::Marker(marker) => { + super::ExamplePart::Marker(marker) => { let (bytes_before, chars_before) = texts.iter().fold((0, 0), |acc, text| { (acc.0 + text.len(), acc.1 + text.chars().count()) @@ -911,6 +852,8 @@ fn parse_tag_form( is_sentence_end: bool, info: &mut BuildInfo, ) -> Result, Error> { + let tagger = info.tagger(); + lazy_static! { static ref REGEX: Regex = Regex::new(r"(.+?)\[(.+?)\]".into()); } @@ -922,7 +865,7 @@ fn parse_tag_form( let text = captures.get(1).expect("1st regex group exists").as_str(); let tags = captures.get(2).expect("2nd regex group exists").as_str(); - let mut tag_vec: Vec<_> = tags + let mut tags: Vec<_> = tags .split(',') .filter_map(|x| { if x == "" { @@ -935,44 +878,46 @@ fn parse_tag_form( None } else { Some(WordData::new( - info.tagger.id_word(parts[0].to_owned().into()), - info.tagger.id_tag(parts[1]).into_static(), + tagger.id_word(parts[0].to_owned().into()), + tagger.id_tag(parts[1]).into_static(), )) } }) .collect(); - tag_vec.push( + tags.push( WordData::new( - info.tagger.id_word(text.to_owned().into()), + tagger.id_word(text.to_owned().into()), PosId::special(SpecialPos::None), ) .freeze(), ); if is_sentence_end { - tag_vec.push(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze()); + tags.push(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze()); } - let tags = Tags::new(tag_vec); - - Ok(tags) + Ok(Tags::new(WordId::empty(), tags)) } impl WordData<'static> { - fn from_structure(data: structure::WordData, info: &mut BuildInfo) -> Self { - WordData::new( - info.tagger + fn from_structure(data: super::WordData, info: &mut BuildInfo) -> Result { + Ok(WordData::new( + info.tagger() .id_word(data.lemma.unwrap_or_else(String::new).into()), - info.tagger + info.tagger() .id_tag(data.pos.as_deref().unwrap_or("").trim()) .into_static(), - ) + )) } } -fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> PosFilter { - match postag_regexp.as_deref() { +fn parse_pos_filter( + postag: &str, + postag_regexp: Option<&str>, + info: &mut BuildInfo, +) -> Result { + Ok(match postag_regexp.as_deref() { Some("yes") => PosFilter::new(PosMatcher::new( Matcher::new_regex( Regex::from_java_regex(&postag, true, true).unwrap(), @@ -980,23 +925,23 @@ fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildI true, ), info, - )), + )?), Some(_) | None => PosFilter::new(PosMatcher::new( Matcher::new_string(either::Left(postag.into()), false, false, true), info, - )), - } + )?), + }) } impl DisambiguationRule { - pub(crate) fn from_rule_structure( - data: structure::DisambiguationRule, + pub fn from_rule_structure( + data: super::DisambiguationRule, info: &mut BuildInfo, ) -> Result { // might need the pattern later so clone it here let (composition, start, end) = parse_pattern(data.pattern.clone(), info)?; - let unify_filters = parse_features(&data.pattern, &data.unifications, info); + let unify_filters = parse_features(&data.pattern, &data.unifications, info)?; let unify_mask: Vec<_> = composition.parts.iter().map(|part| part.unify).collect(); let antipatterns = if let Some(antipatterns) = data.antipatterns { @@ -1025,25 +970,25 @@ impl DisambiguationRule { let word_datas: Vec<_> = if let Some(wds) = data.disambig.word_datas { wds.into_iter() .map(|part| match part { - structure::DisambiguationPart::WordData(x) => { - either::Left(WordData::from_structure(x, info)) + super::DisambiguationPart::WordData(x) => { + WordData::from_structure(x, info).map(either::Left) + } + super::DisambiguationPart::Match(x) => { + parse_pos_filter(&x.postag.unwrap(), x.postag_regexp.as_deref(), info) + .map(either::Right) } - structure::DisambiguationPart::Match(x) => either::Right(parse_pos_filter( - &x.postag.unwrap(), - x.postag_regexp.as_deref(), - info, - )), }) - .collect() + .collect::>()? } else { Vec::new() }; + let tagger = info.tagger(); let disambiguations = match data.disambig.action.as_deref() { Some("remove") => { if let Some(postag) = data.disambig.postag.as_ref() { Ok(Disambiguation::Remove(vec![either::Right( - parse_pos_filter(postag, Some("yes"), info), + parse_pos_filter(postag, Some("yes"), info)?, )])) } else { Ok(Disambiguation::Remove(word_datas.into_iter().collect())) @@ -1081,45 +1026,59 @@ impl DisambiguationRule { for part in &data.pattern.parts { match part { - structure::PatternPart::Marker(marker) => { + super::PatternPart::Marker(marker) => { has_marker = true; for token in &marker.tokens { let token = match token { - structure::TokenCombination::Token(token) => token, - structure::TokenCombination::And(tokens) - | structure::TokenCombination::Or(tokens) => &tokens.tokens[0], - structure::TokenCombination::Feature(_) => continue, + super::TokenCombination::Token(token) => token, + super::TokenCombination::And(tokens) + | super::TokenCombination::Or(tokens) => &tokens.tokens[0], + super::TokenCombination::Feature(_) => continue, }; - marker_disambig.push(token.postag.as_ref().map(|x| { - either::Right(parse_pos_filter( - x, - token.postag_regexp.as_deref(), - info, - )) - })); + marker_disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + token.postag_regexp.as_deref(), + info, + ) + .map(either::Right) + }) + .transpose()?, + ); } } - structure::PatternPart::Token(token) => { - disambig.push(token.postag.as_ref().map(|x| { - either::Right(parse_pos_filter( - x, - token.postag_regexp.as_deref(), - info, - )) - })) - } - structure::PatternPart::And(tokens) - | structure::PatternPart::Or(tokens) => { - disambig.push(tokens.tokens[0].postag.as_ref().map(|x| { - either::Right(parse_pos_filter( - x, - tokens.tokens[0].postag_regexp.as_deref(), - info, - )) - })) + super::PatternPart::Token(token) => disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter(x, token.postag_regexp.as_deref(), info) + .map(either::Right) + }) + .transpose()?, + ), + super::PatternPart::And(tokens) | super::PatternPart::Or(tokens) => { + disambig.push( + tokens.tokens[0] + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + tokens.tokens[0].postag_regexp.as_deref(), + info, + ) + .map(either::Right) + }) + .transpose()?, + ) } - structure::PatternPart::Feature(_) => {} + super::PatternPart::Feature(_) => {} } } @@ -1131,7 +1090,7 @@ impl DisambiguationRule { Ok(Disambiguation::Filter( disambiguations.into_iter().collect(), - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } Some("filter") => { @@ -1141,13 +1100,13 @@ impl DisambiguationRule { postag, Some("yes"), info, - )))], - info.tagger().lang_options().retain_last, + )?))], + tagger.lang_options().retain_last, )) } else { Ok(Disambiguation::Filter( word_datas.into_iter().map(Some).collect(), - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } } @@ -1161,36 +1120,61 @@ impl DisambiguationRule { for part in &data.pattern.parts { match part { - structure::PatternPart::Marker(marker) => { + super::PatternPart::Marker(marker) => { has_marker = true; for token in &marker.tokens { let token = match token { - structure::TokenCombination::Token(token) => token, - structure::TokenCombination::And(tokens) - | structure::TokenCombination::Or(tokens) => &tokens.tokens[0], - structure::TokenCombination::Feature(_) => continue, + super::TokenCombination::Token(token) => token, + super::TokenCombination::And(tokens) + | super::TokenCombination::Or(tokens) => &tokens.tokens[0], + super::TokenCombination::Feature(_) => continue, }; - marker_disambig.push(token.postag.as_ref().map(|x| { - parse_pos_filter(x, token.postag_regexp.as_deref(), info) - })); + marker_disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + token.postag_regexp.as_deref(), + info, + ) + }) + .transpose()?, + ); marker_mask.push(token.unify.is_some()) } } - structure::PatternPart::Token(token) => { - disambig.push(token.postag.as_ref().map(|x| { - parse_pos_filter(x, token.postag_regexp.as_deref(), info) - })); + super::PatternPart::Token(token) => { + disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter(x, token.postag_regexp.as_deref(), info) + }) + .transpose()?, + ); mask.push(token.unify.is_some()); } - structure::PatternPart::And(tokens) - | structure::PatternPart::Or(tokens) => { - disambig.push(tokens.tokens[0].postag.as_ref().map(|x| { - parse_pos_filter(x, tokens.tokens[0].postag_regexp.as_deref(), info) - })); + super::PatternPart::And(tokens) | super::PatternPart::Or(tokens) => { + disambig.push( + tokens.tokens[0] + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + tokens.tokens[0].postag_regexp.as_deref(), + info, + ) + }) + .transpose()?, + ); mask.push(tokens.tokens[0].unify.is_some()); } - structure::PatternPart::Feature(_) => {} + super::PatternPart::Feature(_) => {} } } @@ -1206,15 +1190,15 @@ impl DisambiguationRule { if let Some(postag) = data.disambig.postag.as_ref() { Ok(Disambiguation::Filter( vec![Some(either::Left(WordData::new( - info.tagger.id_word("".into()), - info.tagger.id_tag(postag).into_static(), + tagger.id_word("".into()), + tagger.id_tag(postag).into_static(), )))], - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } else { Ok(Disambiguation::Filter( word_datas.into_iter().map(Some).collect(), - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } } @@ -1253,11 +1237,11 @@ impl DisambiguationRule { for part in &example.parts { match part { - structure::ExamplePart::Text(text) => { + super::ExamplePart::Text(text) => { texts.push(text.as_str()); char_length += text.chars().count(); } - structure::ExamplePart::Marker(marker) => { + super::ExamplePart::Marker(marker) => { if char_span.is_some() { return Err(Error::Unexpected( "example must have one or zero markers".into(), @@ -1330,3 +1314,160 @@ impl DisambiguationRule { }) } } + +macro_rules! flatten_group { + ($rulegroup:expr, $category:expr) => {{ + let group_antipatterns = if let Some(antipatterns) = $rulegroup.antipatterns { + antipatterns + } else { + Vec::new() + }; + + let group = super::Group { + id: $rulegroup.id, + default: $rulegroup.default, + name: $rulegroup.name, + n: 0, + }; + + $rulegroup + .rules + .into_iter() + .enumerate() + .map(|(i, mut rule)| { + if let Some(antipatterns) = &mut rule.antipatterns { + antipatterns.extend(group_antipatterns.clone()); + } else { + rule.antipatterns = Some(group_antipatterns.clone()); + } + + let mut group = group.clone(); + group.n = i; + (rule, Some(group), $category.clone()) + }) + .collect::>() + }}; +} + +type GrammarRuleReading = (super::Rule, Option, Option); +type DisambiguationRuleReading = ( + super::DisambiguationRule, + Option, + Option, +); + +pub fn read_rules>( + path: P, +) -> Vec> { + let file = File::open(path.as_ref()).unwrap(); + let file = BufReader::new(file); + + let sanitized = super::preprocess::sanitize(file, &["suggestion"]); + let rules = super::preprocess::extract_rules(sanitized.as_bytes()); + + let mut unifications = Vec::new(); + + let rules: Vec<_> = rules + .into_iter() + .map(|(xml, category)| { + let mut out = Vec::new(); + + let deseralized = super::RuleContainer::deserialize( + &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())), + ); + + out.extend(match deseralized { + Ok(rule_container) => match rule_container { + super::RuleContainer::Rule(rule) => { + vec![Ok((rule, None, category))] + } + super::RuleContainer::RuleGroup(rule_group) => { + flatten_group!(rule_group, category) + .into_iter() + .map(Ok) + .collect() + } + super::RuleContainer::Unification(unification) => { + unifications.push(unification); + + vec![] + } + }, + Err(err) => vec![Err(err)], + }); + out + }) + .flatten() + .collect(); + + rules + .into_iter() + .map(|result| match result { + Ok(mut x) => { + x.0.unifications = Some(unifications.clone()); + + Ok(x) + } + Err(x) => Err(x), + }) + .collect() +} + +pub fn read_disambiguation_rules>( + path: P, +) -> Vec> { + let file = File::open(path.as_ref()).unwrap(); + let file = BufReader::new(file); + + let sanitized = super::preprocess::sanitize(file, &[]); + let rules = super::preprocess::extract_rules(sanitized.as_bytes()); + + let mut unifications = Vec::new(); + + let rules: Vec<_> = rules + .into_iter() + .map(|(xml, _)| { + let mut out = Vec::new(); + + let deseralized = super::DisambiguationRuleContainer::deserialize( + &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())), + ); + + let category: Option = None; + + out.extend(match deseralized { + Ok(rule_container) => match rule_container { + super::DisambiguationRuleContainer::Rule(rule) => { + vec![Ok((rule, None, category))] + } + super::DisambiguationRuleContainer::RuleGroup(rule_group) => { + flatten_group!(rule_group, category) + .into_iter() + .map(Ok) + .collect() + } + super::DisambiguationRuleContainer::Unification(unification) => { + unifications.push(unification); + + vec![] + } + }, + Err(err) => vec![Err(err)], + }); + out + }) + .flatten() + .collect(); + + rules + .into_iter() + .map(|result| match result { + Ok(mut x) => { + x.0.unifications = Some(unifications.clone()); + + Ok(x) + } + Err(x) => Err(x), + }) + .collect() +} diff --git a/nlprule/src/components/rules/mod.rs b/nlprule/src/components/rules/mod.rs new file mode 100644 index 0000000..78be9ea --- /dev/null +++ b/nlprule/src/components/rules/mod.rs @@ -0,0 +1,338 @@ +use log::info; +use serde::{Deserialize, Serialize}; +use std::iter::FromIterator; + +use crate::properties::*; +use crate::rule::Rule; +use crate::types::*; +use crate::utils::parallelism::MaybeParallelRefIterator; +use crate::{ + properties::Transform, + rule::{ + id::{Index, Selector}, + DisambiguationRule, MatchSentence, + }, + types::Sentence, +}; +use once_cell::sync::OnceCell; + +use super::Component; + +#[cfg(feature = "compile")] +mod compile; + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct Disambiguator { + rules: Vec, + #[serde(skip)] + properties: OnceCell, +} + +impl Transform for Disambiguator { + fn properties(&self) -> PropertiesMut { + *self.properties.get_or_init(|| { + self.rules + .iter() + .map(|rule| rule.compute_properties()) + .collect() + }) + } + + fn transform<'t>( + &'t self, + sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + self.disambiguate_up_to_id(sentence, None) + } + + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let mut current_rules: Vec<&DisambiguationRule> = Vec::new(); + let mut passes = 0; + + for rule in self.rules() { + let pipeline = tokenize::Pipeline::new(( + &tokenizer, + current_rules + .iter() + .map(|x| (*x).clone()) + .collect::(), + ))?; + + if rule.test(&pipeline).is_ok() { + passes += 1; + } + + current_rules.push(rule); + } + + info!( + "{0} out of {1} Disambiguation Rule tests passed.", + passes, + self.rules.len() + ); + + if passes == self.rules().len() { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } + } +} + +impl Component for Disambiguator { + fn name() -> &'static str { + "disambiguator" + } +} + +impl Disambiguator { + /// Gets all disambigation rules in the order they are applied. + pub fn rules(&self) -> &[DisambiguationRule] { + &self.rules + } + + pub(crate) fn disambiguate_up_to_id<'t>( + &'t self, + mut sentence: Sentence<'t>, + id: Option<&Index>, + ) -> Result, crate::properties::Error> { + let n = id.map_or(self.rules.len(), |id| { + self.rules.iter().position(|x| x.id == *id).unwrap() + }); + let mut i = 0; + + let guard = self.property_guard(&mut sentence)?; + + while i < n { + let match_sentence = MatchSentence::new(&sentence, guard.downgrade()); + + let result = self.rules[i..n] + .maybe_par_iter() + .enumerate() + .filter_map(|(j, rule)| { + let changes = rule.apply(&match_sentence); + + match changes { + Ok(changes) => { + if changes.is_empty() { + None + } else { + Some(Ok((j + i, changes))) + } + } + Err(err) => Some(Err(err)), + } + }) + .find_first(|_| true) + .transpose()?; + + if let Some((index, changes)) = result { + self.rules[index].change(&mut sentence, changes, guard)?; + i = index + 1; + } else { + i = n; + } + } + + Ok(sentence) + } +} + +/// A set of grammatical error correction rules. +#[derive(Serialize, Deserialize, Default, Clone)] +pub struct Rules { + rules: Vec, + #[serde(skip)] + properties: OnceCell, +} + +impl Component for Rules { + fn name() -> &'static str { + "rules" + } +} + +impl Suggest for Rules { + fn properties(&self) -> Properties { + *self.properties.get_or_init(|| { + self.rules + .iter() + .map(|rule| rule.compute_properties()) + .collect() + }) + } + + fn suggest(&self, sentence: &Sentence) -> Result, crate::properties::Error> { + let sentence = MatchSentence::new(sentence, self.property_guard(sentence)?); + + let mut output: Vec<(usize, Suggestion)> = self + .rules + .maybe_par_iter() + .enumerate() + .filter(|(_, rule)| rule.enabled()) + .map(|(i, rule)| { + let mut output = Vec::new(); + + for suggestion in rule.apply(&sentence) { + match suggestion { + Ok(suggestion) => output.push((i, suggestion)), + Err(err) => return Err(err), + } + } + + Ok(output) + }) + .collect::>, crate::properties::Error>>()? + .into_iter() + .flatten() + .collect(); + + output.sort_by(|(ia, a), (ib, b)| { + a.span() + .char() + .start + .cmp(&b.span().char().start) + .then_with(|| ib.cmp(ia)) + }); + + let mut mask = vec![false; sentence.text().chars().count()]; + + Ok(output + .into_iter() + .filter_map(|(_, suggestion)| { + let span = suggestion.span().clone().lshift(sentence.span().start()); + + if mask[span.char().clone()].iter().all(|x| !x) { + mask[span.char().clone()].iter_mut().for_each(|x| *x = true); + Some(suggestion) + } else { + None + } + }) + .collect()) + } + + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let mut passes = 0; + + for rule in self.rules() { + if rule.test(&tokenizer).is_ok() { + passes += 1; + }; + } + + info!( + "{0} out of {1} Grammar Rule tests passed.", + passes, + self.rules.len() + ); + + if passes == self.rules().len() { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } + } +} + +impl Rules { + /// All rules ordered by priority. + pub fn rules(&self) -> &[Rule] { + &self.rules + } + + /// All rules ordered by priority (mutable). + pub fn rules_mut(&mut self) -> &mut [Rule] { + &mut self.rules + } + + /// Returns an iterator over all rules matching the selector. + pub fn select<'a>(&'a self, selector: &'a Selector) -> RulesIter<'a> { + RulesIter { + inner: self.rules.iter(), + selector: Some(selector), + } + } + + /// Returns an iterator over all rules matching the selector (mutable). + pub fn select_mut<'a>(&'a mut self, selector: &'a Selector) -> RulesIterMut<'a> { + RulesIterMut { + inner: self.rules.iter_mut(), + selector: Some(selector), + } + } +} + +/// An iterator over references to rules. +pub struct RulesIter<'a> { + selector: Option<&'a Selector>, + inner: std::slice::Iter<'a, Rule>, +} + +impl<'a> Iterator for RulesIter<'a> { + type Item = &'a Rule; + fn next(&mut self) -> Option { + let selector = self.selector.as_ref(); + + self.inner + .find(|rule| selector.map_or(true, |s| s.is_match(rule.id()))) + } +} + +/// An iterator over mutable references to rules. +pub struct RulesIterMut<'a> { + selector: Option<&'a Selector>, + inner: std::slice::IterMut<'a, Rule>, +} + +impl<'a> Iterator for RulesIterMut<'a> { + type Item = &'a mut Rule; + fn next(&mut self) -> Option { + let selector = self.selector.as_ref(); + + self.inner + .find(|rule| selector.map_or(true, |s| s.is_match(rule.id()))) + } +} + +impl IntoIterator for Rules { + type Item = Rule; + type IntoIter = std::vec::IntoIter; + fn into_iter(self) -> Self::IntoIter { + self.rules.into_iter() + } +} + +impl FromIterator for Rules +where + R: Into, +{ + fn from_iter>(iter: I) -> Self { + let rules: Vec = iter.into_iter().map(|x| x.into()).collect(); + Self { + rules, + properties: OnceCell::default(), + } + } +} + +impl IntoIterator for Disambiguator { + type Item = DisambiguationRule; + type IntoIter = std::vec::IntoIter; + fn into_iter(self) -> Self::IntoIter { + self.rules.into_iter() + } +} + +impl FromIterator for Disambiguator +where + R: Into, +{ + fn from_iter>(iter: I) -> Self { + let rules: Vec = iter.into_iter().map(|x| x.into()).collect(); + Self { + rules, + properties: OnceCell::default(), + } + } +} diff --git a/nlprule/src/components/tagger/compile.rs b/nlprule/src/components/tagger/compile.rs new file mode 100644 index 0000000..0e4dd94 --- /dev/null +++ b/nlprule/src/components/tagger/compile.rs @@ -0,0 +1,165 @@ +use fs_err as fs; +use fs_err::File; + +use crate::compile::{BuildComponent, BuildInfo, Error}; +use crate::components::tagger::TaggerLangOptions; + +use super::*; +use serde::Deserialize; +use std::{ + collections::HashSet, + io::{BufRead, BufReader}, + path::{Path, PathBuf}, +}; + +fn get_lines, S2: AsRef>( + paths: &[S1], + remove_paths: &[S2], +) -> std::io::Result> { + let mut output = Vec::new(); + let mut disallowed: Vec = Vec::new(); + + for path in remove_paths { + let file = File::open(path.as_ref())?; + let reader = std::io::BufReader::new(file); + + for line in reader.lines() { + let line = line?; + if line.starts_with('#') { + continue; + } + + disallowed.push(line.to_string()); + } + } + + for path in paths { + let file = File::open(path.as_ref())?; + let reader = std::io::BufReader::new(file); + + for line in reader.lines() { + let line = line?; + if line.starts_with('#') { + continue; + } + + if disallowed.contains(&line) { + continue; + } + + let parts: Vec<_> = line.split('\t').collect(); + + let word = parts[0].to_string(); + let inflection = parts[1].to_string(); + let tag = parts[2].to_string(); + + output.push((word, inflection, tag)) + } + } + + Ok(output) +} + +#[derive(Deserialize)] +pub struct Paths { + tag_dict: Vec, + tag_remove_dict: Vec, + common_words: PathBuf, + tagger_options: PathBuf, +} + +impl BuildComponent for Tagger { + type Paths = Paths; + + /// TODO: move and update + /// Creates a tagger from raw files. + /// + /// # Arguments + /// * `paths`: Paths to files where each line contains the word, lemma and tag, respectively, + /// separated by tabs, to be added to the tagger. + /// * `remove_paths`: Paths to files where each line contains the word, lemma and tag, respectively, + /// separated by tabs, to be removed from the tagger if present in the files from `paths`. + fn build(paths: Paths, _build_info: Option<&mut BuildInfo>) -> Result { + let options: TaggerLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.tagger_options)?))?; + let common_words: HashSet = fs::read_to_string(paths.common_words)? + .lines() + .map(ToOwned::to_owned) + .collect(); + + let mut tag_store = HashSet::new(); + let mut word_store = HashSet::new(); + + // add language specific special tags + tag_store.extend(options.extra_tags.iter().map(|x| x.as_str())); + + let lines = get_lines(&paths.tag_dict, &paths.tag_remove_dict)?; + + let punct = "!\"#$%&\\'()*+,-./:;<=>?@[\\]^_`{|}~"; + for i in 0..punct.len() { + word_store.insert(&punct[i..(i + 1)]); + } + + word_store.extend(common_words.iter().map(|x| x.as_str())); + + for (word, inflection, tag) in lines.iter() { + word_store.insert(word); + word_store.insert(inflection); + tag_store.insert(tag); + } + + // the empty string must not be part of any wordlist + assert!(!word_store.contains("")); + + // word store ids should be consistent across runs + let mut word_store: Vec<_> = word_store.into_iter().collect(); + word_store.sort_unstable(); + + // add special empty string to wordlist, must be the first element to have id 0 + word_store.insert(0, ""); + + // tag store ids should be consistent across runs + let mut tag_store: Vec<_> = tag_store.into_iter().collect(); + tag_store.sort_unstable(); + + // add special part of speech tags, they must have ids starting from zero + for (i, special_pos) in SpecialPos::iter().enumerate() { + tag_store.insert(i, special_pos); + } + + let word_store: BiMap<_, _> = word_store + .iter() + .enumerate() + .map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32))) + .collect(); + let tag_store: BiMap<_, _> = tag_store + .iter() + .enumerate() + .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16))) + .collect(); + + let mut tags: Vec>> = vec![None; word_store.len()]; + + for (word, inflection, tag) in lines.iter() { + let word_id = word_store.get_by_left(word).unwrap(); + let lemma_id = word_store.get_by_left(inflection).unwrap(); + let pos_id = tag_store.get_by_left(tag).unwrap(); + + match &mut tags[word_id.value() as usize] { + Some(vec) => { + vec.push((*lemma_id, *pos_id)); + } + None => { + tags[word_id.value() as usize] = Some(vec![(*lemma_id, *pos_id)]); + } + } + } + + Ok(Tagger { + tags: WordIdMap(tags), + word_store, + tag_store, + lang_options: options, + }) + } +} diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/components/tagger/mod.rs similarity index 92% rename from nlprule/src/tokenizer/tag.rs rename to nlprule/src/components/tagger/mod.rs index c1ad8fc..4a88c0c 100644 --- a/nlprule/src/tokenizer/tag.rs +++ b/nlprule/src/components/tagger/mod.rs @@ -1,6 +1,6 @@ //! A dictionary-based tagger. -use crate::{types::*, utils::parallelism::MaybeParallelRefIterator}; +use crate::{properties::*, types::*, utils::parallelism::MaybeParallelRefIterator}; use bimap::BiMap; use fst::{IntoStreamer, Map, Streamer}; use log::error; @@ -12,6 +12,9 @@ use std::{ iter::{once, FusedIterator}, }; +#[cfg(feature = "compile")] +mod compile; + #[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)] #[serde(transparent)] pub(crate) struct WordIdInt(u32); @@ -58,6 +61,12 @@ impl<'t> fmt::Debug for WordId<'t> { } } +impl<'t> Default for WordId<'t> { + fn default() -> Self { + WordId::empty() + } +} + impl<'t> WordId<'t> { pub(crate) fn id(&self) -> &Option { &self.1 @@ -72,6 +81,13 @@ impl<'t> WordId<'t> { self.0.as_ref() } + pub fn as_ref_str(&self) -> &'t str { + match &self.0 { + Cow::Borrowed(x) => *x, + Cow::Owned(_) => panic!("can not get `&'t str` reference from owned Cow!"), + } + } + /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data. pub fn into_static(self) -> WordId<'static> { WordId(self.0.into_owned().into(), self.1) @@ -397,11 +413,9 @@ impl WordIdMap { .iter() .enumerate() .filter_map(|(index, maybe_value)| { - if let Some(value) = maybe_value { - Some((WordIdInt(index as u32), value)) - } else { - None - } + maybe_value + .as_ref() + .map(|value| (WordIdInt(index as u32), value)) }) } } @@ -534,10 +548,10 @@ impl<'a> ExactSizeIterator for TagIter<'a> { #[derive(Default, Serialize, Deserialize, Clone)] #[serde(from = "TaggerFields", into = "TaggerFields")] pub struct Tagger { - pub(crate) tags: WordIdMap>, - pub(crate) tag_store: BiMap, - pub(crate) word_store: BiMap, - pub(crate) lang_options: TaggerLangOptions, + tags: WordIdMap>, + tag_store: BiMap, + word_store: BiMap, + lang_options: TaggerLangOptions, } impl Tagger { @@ -725,4 +739,42 @@ impl Tagger { pub fn get_tags<'a>(&'a self, word: &'a str) -> TagIter<'a> { self.get_tags_with_options(word, None, None) } + + pub fn transform<'t>( + &'t self, + mut sentence: Sentence<'t>, + guard: PropertyGuardMut, + ) -> Result, crate::properties::Error> { + for token in sentence.iter_mut() { + let mut tag_vec: Vec<_> = self + .get_tags_with_options( + token.as_str(), + if token.is_sentence_start() { + Some(true) + } else { + None + }, + None, + ) + .collect(); + + tag_vec.push( + WordData::new( + self.id_word(token.as_str().into()), + PosId::special(SpecialPos::None), + ) + .freeze(), + ); + + if token.is_sentence_end() { + tag_vec.push( + WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze(), + ); + } + + *guard.tags_mut(token)? = Tags::new(self.id_word(token.as_str().into()), tag_vec); + } + + Ok(sentence) + } } diff --git a/nlprule/src/components/tokenizer/compile.rs b/nlprule/src/components/tokenizer/compile.rs new file mode 100644 index 0000000..63bb36c --- /dev/null +++ b/nlprule/src/components/tokenizer/compile.rs @@ -0,0 +1,45 @@ +use fs_err as fs; +use fs_err::File; +use std::{io::BufReader, path::PathBuf, str::FromStr}; + +use crate::compile::{BuildComponent, BuildInfo, Error}; + +use super::*; + +#[derive(Deserialize)] +pub struct Paths { + tokenizer_options: PathBuf, + srx: PathBuf, + lang_code: PathBuf, +} + +impl BuildComponent for Tokenizer { + type Paths = Paths; + + fn build(paths: Paths, build_info: Option<&mut BuildInfo>) -> Result { + let build_info = build_info.ok_or(Error::BuildInfoUnset)?; + + let options: TokenizerLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.tokenizer_options)?))?; + let lang_code = fs::read_to_string(paths.lang_code)?; + + let sentencizer = + srx::SRX::from_str(&fs::read_to_string(&paths.srx)?)?.language_rules(lang_code); + + let mut whitelist = DefaultHashSet::new(); + + for (word, _) in build_info.tagger().word_store() { + if word.contains(|c| options.extra_split_chars.contains(&c)) { + whitelist.insert(word.to_owned()); + } + } + + Ok(Tokenizer { + tagger: build_info.tagger().clone(), + sentencizer, + lang_options: options, + whitelist, + properties: Default::default(), + }) + } +} diff --git a/nlprule/src/components/tokenizer/mod.rs b/nlprule/src/components/tokenizer/mod.rs new file mode 100644 index 0000000..3ce20ff --- /dev/null +++ b/nlprule/src/components/tokenizer/mod.rs @@ -0,0 +1,238 @@ +//! A tokenizer to split raw text into tokens. +//! Tokens are assigned lemmas and part-of-speech tags by lookup from a [Tagger][tag::Tagger] and chunks containing +//! information about noun / verb and grammatical case by a statistical [Chunker][chunk::Chunker]. +//! Tokens are *disambiguated* (i. e. information from the initial assignment is changed) in a rule-based way by +//! [DisambiguationRule][crate::rule::DisambiguationRule]s. + +#[cfg(feature = "compile")] +mod compile; + +use std::ops::Range; + +use crate::types::*; +use crate::{properties::*, utils::regex::Regex}; +use lazy_static::lazy_static; +use once_cell::sync::OnceCell; +use serde::{Deserialize, Serialize}; + +use super::{tagger::Tagger, Component}; + +/// Split a text at the points where the given function is true. +/// Keeps the separators. See https://stackoverflow.com/a/40296745. +fn split(text: &str, split_func: F) -> Vec<&str> +where + F: Fn(char) -> bool, +{ + let mut result = Vec::new(); + let mut last = 0; + for (index, matched) in text.match_indices(split_func) { + if last != index { + result.push(&text[last..index]); + } + result.push(matched); + last = index + matched.len(); + } + if last < text.len() { + result.push(&text[last..]); + } + + result +} + +/// Options for a tokenizer. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub(crate) struct TokenizerLangOptions { + /// Extra language-specific characters to split text on. + #[serde(default)] + pub extra_split_chars: Vec, + /// Extra language-specific Regexes of which the matches will *not* be split into multiple tokens. + #[serde(default)] + pub extra_join_regexes: Vec, +} + +/// The complete Tokenizer doing tagging, chunking and disambiguation. +#[derive(Serialize, Deserialize, Default, Clone)] +pub struct Tokenizer { + whitelist: DefaultHashSet, + sentencizer: srx::Rules, + tagger: Tagger, + lang_options: TokenizerLangOptions, + #[serde(skip)] + properties: OnceCell, +} + +impl Tokenize for Tokenizer { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); + } + *PROPERTIES + } + + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + Box::new(SentenceIter { + text, + splits: self.sentencizer.split_ranges(text), + tokenizer: &self, + index: 0, + position: Position::default(), + }) + } + + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option> { + if sentence.trim().is_empty() { + return None; + } + + let token_strs = self + .get_token_ranges(sentence) + .filter(|range| !sentence[range.clone()].trim().is_empty()); + + let n_token_strs = token_strs.clone().count(); + + let tokens: Vec<_> = token_strs + .enumerate() + .map(|(i, range)| { + let byte_start = range.start; + let char_start = sentence[..byte_start].chars().count(); + + let token_text = sentence[range].trim(); + + let is_sentence_start = i == 0; + let is_sentence_end = i == n_token_strs - 1; + + Token::new( + token_text, + Span::new( + byte_start..byte_start + token_text.len(), + char_start..char_start + token_text.chars().count(), + ), + is_sentence_start, + is_sentence_end, + sentence[..byte_start].ends_with(char::is_whitespace), + ) + }) + .collect(); + + let mut sentence = Sentence::new(tokens, sentence, &self.tagger); + let guard = self.property_guard(&mut sentence).expect("TODO"); + + sentence = self.tagger.transform(sentence, guard).expect("TODO"); + + Some(sentence) + } +} + +/// An iterator over sentences. Has some key properties: +/// - Preceding whitespace is always included so the first sentence always starts at byte and char index zero. +/// - There are no gaps between sentences i.e. `sentence[i - 1].span().end() == sentence[i].span().start()`. +/// - Behavior for trailing whitespace is not defined. Can be included in the last sentence or not be part of any sentence. +pub struct SentenceIter<'t> { + text: &'t str, + splits: Vec>, + tokenizer: &'t Tokenizer, + index: usize, + position: Position, +} + +impl<'t> Iterator for SentenceIter<'t> { + type Item = Sentence<'t>; + + fn next(&mut self) -> Option { + if self.index == self.splits.len() { + return None; + } + + let mut range = self.splits[self.index].clone(); + self.index += 1; + + // as long as the current sentence contains only whitespace, add the next sentence + // in practice, this might never happen, but we can not make any assumption about + // SRX rule behavior here. + while self.text[range.clone()].trim().is_empty() && self.index < self.splits.len() { + range.end = self.splits[self.index].end; + self.index += 1; + } + + let sentence = self + .tokenizer + .tokenize_sentence(&self.text[range.clone()]) + .map(|x| x.rshift(self.position)); + + self.position += Position { + char: self.text[range.clone()].chars().count(), + byte: range.len(), + }; + + sentence + } +} + +impl Component for Tokenizer { + fn name() -> &'static str { + "tokenizer" + } +} + +impl Tokenizer { + /// Gets the lexical tagger. + pub fn tagger(&self) -> &Tagger { + &self.tagger + } + + fn get_token_ranges<'t>( + &self, + text: &'t str, + ) -> impl ExactSizeIterator> + 't + Clone { + let mut tokens = Vec::new(); + + let split_char = |c: char| c.is_whitespace() || crate::utils::splitting_chars().contains(c); + let split_text = |text: &'t str| { + let mut tokens = Vec::new(); + for pretoken in split(text, split_char) { + // if the token is in the dictionary, we add it right away + if self.whitelist.contains(pretoken) { + tokens.push(pretoken); + } else { + // otherwise, potentially split it again with `extra_split_chars` e. g. "-" + tokens.extend(split(pretoken, |c| { + split_char(c) || self.lang_options.extra_split_chars.contains(&c) + })); + } + } + tokens + }; + + let mut joined_mask = vec![false; text.len()]; + let mut joins = Vec::new(); + + for regex in self.lang_options.extra_join_regexes.iter() { + for mat in regex.find_iter(text) { + if !joined_mask[mat.start()..mat.end()].iter().any(|x| *x) { + joins.push(mat.start()..mat.end()); + joined_mask[mat.start()..mat.end()] + .iter_mut() + .for_each(|x| *x = true); + } + } + } + + joins.sort_by(|a, b| a.start.cmp(&b.start)); + + let mut prev = 0; + for range in joins { + tokens.extend(split_text(&text[prev..range.start])); + prev = range.end; + tokens.push(&text[range]); + } + + tokens.extend(split_text(&text[prev..text.len()])); + tokens.into_iter().map(move |token| { + let byte_start = (token.as_ptr() as usize) + .checked_sub(text.as_ptr() as usize) + .expect("Each token str is a slice of the text str."); + + byte_start..byte_start + token.len() + }) + } +} diff --git a/nlprule/src/lang.rs b/nlprule/src/lang.rs new file mode 100644 index 0000000..f627fef --- /dev/null +++ b/nlprule/src/lang.rs @@ -0,0 +1,37 @@ +use std::path::{Path, PathBuf}; + +const MANIFEST_DIR: &str = env!("CARGO_MANIFEST_DIR"); + +pub fn binary_path(lang_code: &str, name: &str) -> PathBuf { + Path::new(MANIFEST_DIR) + .join(lang_code) + .join(format!("{}.bin", name)) +} + +#[allow(unused)] +macro_rules! binary { + ($component: ty, $lang_code:literal, $name:literal) => {{ + use crate::components::Component; + + let mut bytes: &'static [u8] = include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/storage/", + $lang_code, + "/", + $name, + ".bin" + )); + + <$component>::from_reader(&mut bytes) + }}; +} + +#[allow(unused)] +const ERROR_MSG: &str = "binaries are pre-tested."; + +#[cfg(feature = "binaries-de")] +pub mod de; +#[cfg(feature = "binaries-en")] +pub mod en; +#[cfg(feature = "binaries-es")] +pub mod es; diff --git a/nlprule/src/lang/de.rs b/nlprule/src/lang/de.rs new file mode 100644 index 0000000..6e3d337 --- /dev/null +++ b/nlprule/src/lang/de.rs @@ -0,0 +1,31 @@ +use super::ERROR_MSG; +use crate::{ + components::{ + rules::{Disambiguator, Rules}, + tokenizer::Tokenizer, + }, + properties::{tokenize, CreatePipe, Pipeline}, +}; + +pub type Analyzer = tokenize::Pipeline<(Tokenizer, Disambiguator)>; +pub type Correcter = Pipeline<(Analyzer, Rules)>; + +pub fn tokenizer() -> Tokenizer { + binary!(Tokenizer, "de", "tokenizer").expect(ERROR_MSG) +} + +pub fn disambiguator() -> Disambiguator { + binary!(Disambiguator, "de", "disambiguator").expect(ERROR_MSG) +} + +pub fn rules() -> Rules { + binary!(Rules, "de", "rules").expect(ERROR_MSG) +} + +pub fn analyzer() -> Analyzer { + tokenize::Pipeline::new((tokenizer(), disambiguator())).expect(ERROR_MSG) +} + +pub fn correcter() -> Correcter { + Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) +} diff --git a/nlprule/src/lang/en.rs b/nlprule/src/lang/en.rs new file mode 100644 index 0000000..5dfd743 --- /dev/null +++ b/nlprule/src/lang/en.rs @@ -0,0 +1,42 @@ +use super::ERROR_MSG; +use crate::{ + components::{ + chunker::Chunker, + multiword_tagger::MultiwordTagger, + rules::{Disambiguator, Rules}, + tokenizer::Tokenizer, + }, + properties::{tokenize, CreatePipe, Pipeline}, +}; + +pub type Analyzer = tokenize::Pipeline<(Tokenizer, MultiwordTagger, Chunker, Disambiguator)>; +pub type Correcter = Pipeline<(Analyzer, Rules)>; + +pub fn tokenizer() -> Tokenizer { + binary!(Tokenizer, "en", "tokenizer").expect(ERROR_MSG) +} + +pub fn multiword_tagger() -> MultiwordTagger { + binary!(MultiwordTagger, "en", "tokenizer").expect(ERROR_MSG) +} + +pub fn chunker() -> Chunker { + binary!(Chunker, "en", "chunker").expect(ERROR_MSG) +} + +pub fn disambiguator() -> Disambiguator { + binary!(Disambiguator, "en", "disambiguator").expect(ERROR_MSG) +} + +pub fn rules() -> Rules { + binary!(Rules, "en", "rules").expect(ERROR_MSG) +} + +pub fn analyzer() -> Analyzer { + tokenize::Pipeline::new((tokenizer(), multiword_tagger(), chunker(), disambiguator())) + .expect(ERROR_MSG) +} + +pub fn correcter() -> Correcter { + Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) +} diff --git a/nlprule/src/lang/es.rs b/nlprule/src/lang/es.rs new file mode 100644 index 0000000..5f4194a --- /dev/null +++ b/nlprule/src/lang/es.rs @@ -0,0 +1,36 @@ +use super::ERROR_MSG; +use crate::{ + components::{ + multiword_tagger::MultiwordTagger, + rules::{Disambiguator, Rules}, + tokenizer::Tokenizer, + }, + properties::{tokenize, CreatePipe, Pipeline}, +}; + +pub type Analyzer = tokenize::Pipeline<(Tokenizer, MultiwordTagger, Disambiguator)>; +pub type Correcter = Pipeline<(Analyzer, Rules)>; + +pub fn tokenizer() -> Tokenizer { + binary!(Tokenizer, "es", "tokenizer").expect(ERROR_MSG) +} + +pub fn multiword_tagger() -> MultiwordTagger { + binary!(MultiwordTagger, "es", "multiword_tagger").expect(ERROR_MSG) +} + +pub fn disambiguator() -> Disambiguator { + binary!(Disambiguator, "es", "disambiguator").expect(ERROR_MSG) +} + +pub fn rules() -> Rules { + binary!(Rules, "es", "rules").expect(ERROR_MSG) +} + +pub fn analyzer() -> Analyzer { + tokenize::Pipeline::new((tokenizer(), multiword_tagger(), disambiguator())).expect(ERROR_MSG) +} + +pub fn correcter() -> Correcter { + Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) +} diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs index 585c591..4e4edf4 100644 --- a/nlprule/src/lib.rs +++ b/nlprule/src/lib.rs @@ -10,13 +10,12 @@ //! Correct a text: //! //! ```no_run -//! use nlprule::{Tokenizer, Rules}; +//! use nlprule::lang::en; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; -//! let rules = Rules::new("path/to/en_rules.bin")?; +//! let correcter = en::correcter(); //! //! assert_eq!( -//! rules.correct("She was not been here since Monday.", &tokenizer), +//! correcter.correct("She was not been here since Monday.").collect::>().join(""), //! String::from("She was not here since Monday.") //! ); //! # Ok::<(), nlprule::Error>(()) @@ -25,64 +24,58 @@ //! Get suggestions and correct a text: //! //! ```no_run -//! use nlprule::{Tokenizer, Rules, types::Suggestion, rules::apply_suggestions}; +//! use nlprule::lang::en; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; -//! let rules = Rules::new("path/to/en_rules.bin")?; +//! let correcter = en::correcter(); //! //! let text = "She was not been here since Monday."; //! -//! let suggestions = rules.suggest(text, &tokenizer); +//! let suggestions = correcter.suggest(text).next().expect("`text` contains one sentence."); //! assert_eq!(*suggestions[0].span().char(), 4usize..16); //! assert_eq!(suggestions[0].replacements(), vec!["was not", "has not been"]); //! assert_eq!(suggestions[0].source(), "GRAMMAR/WAS_BEEN/1"); //! assert_eq!(suggestions[0].message(), "Did you mean was not or has not been?"); //! -//! let corrected = apply_suggestions(text, &suggestions); -//! -//! assert_eq!(corrected, "She was not here since Monday."); //! # Ok::<(), nlprule::Error>(()) //! ``` //! //! Tokenize & analyze a text: //! //! ```no_run -//! use nlprule::Tokenizer; +//! use nlprule::lang::en; +//! use nlprule::properties::Tokenize; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; +//! let analyzer = en::analyzer(); //! //! let text = "A brief example is shown."; //! //! // returns an iterator over sentences -//! let sentence = tokenizer.pipe(text).next().expect("`text` contains one sentence."); +//! let sentence = analyzer.tokenize(text).next().expect("`text` contains one sentence."); //! //! println!("{:#?}", sentence); -//! assert_eq!(sentence.tokens()[1].word().text().as_str(), "brief"); -//! assert_eq!(sentence.tokens()[1].word().tags()[0].pos().as_str(), "JJ"); -//! assert_eq!(sentence.tokens()[1].chunks(), vec!["I-NP-singular"]); +//! assert_eq!(sentence.tokens()[1].as_str(), "brief"); +//! assert_eq!(sentence.tokens()[1].tags()?.iter().next().unwrap().pos().as_str(), "JJ"); +//! assert_eq!(sentence.tokens()[1].chunks()?, &["I-NP-singular"]); //! // some other information like char / byte span, lemmas etc. is also set! //! # Ok::<(), nlprule::Error>(()) //! ``` -//! --- -//! Binaries are distributed with [Github releases](https://github.com/bminixhofer/nlprule/releases). -#![warn(missing_docs)] +// #![warn(missing_docs)] use std::io; use thiserror::Error; #[cfg(feature = "compile")] pub mod compile; +pub mod components; mod filter; +#[macro_use] +pub mod lang; +pub mod properties; pub mod rule; -pub mod rules; -pub mod tokenizer; pub mod types; pub(crate) mod utils; -pub use rules::Rules; -pub use tokenizer::Tokenizer; - #[derive(Error, Debug)] #[allow(missing_docs)] pub enum Error { @@ -93,30 +86,8 @@ pub enum Error { Serialization(#[from] bincode::Error), #[error(transparent)] IdError(#[from] rule::id::Error), -} - -/// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format. -pub fn tokenizer_filename(lang_code: &str) -> String { - format!("{}_tokenizer.bin", lang_code) -} - -/// Gets the canonical filename for the rules binary for a language code in ISO 639-1 (two-letter) format. -pub fn rules_filename(lang_code: &str) -> String { - format!("{}_rules.bin", lang_code) -} - -/// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format. -#[macro_export] -macro_rules! tokenizer_filename { - ($lang_code:literal) => { - concat!($lang_code, "_tokenizer.bin") - }; -} - -/// Gets the canonical filename for the rules binary for a language code in ISO 639-1 (two-letter) format. -#[macro_export] -macro_rules! rules_filename { - ($lang_code:literal) => { - concat!($lang_code, "_rules.bin") - }; + #[error(transparent)] + Property(#[from] properties::Error), + #[error("Test failed. See logs for details.")] + TestFailed, } diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs new file mode 100644 index 0000000..de151e0 --- /dev/null +++ b/nlprule/src/properties.rs @@ -0,0 +1,687 @@ +use serde::{Deserialize, Serialize}; + +use crate::types::*; +use thiserror::Error; + +pub use suggest::Suggest; +pub use tokenize::Tokenize; +pub use transform::Transform; + +pub mod suggest { + use super::*; + + /// Correct a text by applying suggestions to it. + /// In the case of multiple possible replacements, always chooses the first one. + pub fn apply_suggestions(sentence: &Sentence, suggestions: &[Suggestion]) -> String { + let mut offset: isize = -(sentence.span().char().start as isize); + let mut chars: Vec<_> = sentence.text().chars().collect(); + + for suggestion in suggestions { + let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); + chars.splice( + (suggestion.span().char().start as isize + offset) as usize + ..(suggestion.span().char().end as isize + offset) as usize, + replacement.iter().cloned(), + ); + offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize; + } + + chars.into_iter().collect() + } + + pub trait Suggest { + fn properties(&self) -> Properties { + Properties::default() + } + + fn property_guard(&self, sentence: &Sentence) -> Result { + self.properties().build(sentence) + } + + fn suggest(&self, sentence: &Sentence) -> Result, Error>; + + fn correct(&self, sentence: &Sentence) -> Result { + let suggestions = self.suggest(sentence)?; + Ok(apply_suggestions(&sentence, &suggestions)) + } + + #[allow(unused_variables)] + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + Ok(()) + } + } + + impl<'a, T> Suggest for &'a T + where + T: Suggest, + { + fn properties(&self) -> Properties { + (*self).properties() + } + + fn property_guard(&self, sentence: &Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn suggest(&self, sentence: &Sentence) -> Result, Error> { + (*self).suggest(sentence) + } + + fn correct(&self, sentence: &Sentence) -> Result { + (*self).correct(sentence) + } + + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + (*self).test(tokenizer) + } + } +} + +pub mod transform { + use super::*; + + pub trait Transform { + fn properties(&self) -> PropertiesMut { + PropertiesMut::default() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + self.properties().build(sentence) + } + + fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error>; + + #[allow(unused_variables)] + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + Ok(()) + } + } + + impl<'a, T> Transform for &'a T + where + T: Transform, + { + fn properties(&self) -> PropertiesMut { + (*self).properties() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error> { + (*self).transform(sentence) + } + + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + (*self).test(tokenizer) + } + } + + #[derive(Serialize, Deserialize, Clone)] + pub struct Pipeline(pub(super) T, pub(super) PropertiesMut); +} + +pub mod tokenize { + use super::*; + + pub trait Tokenize { + fn properties(&self) -> PropertiesMut { + PropertiesMut::default() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + self.properties().build(sentence) + } + + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't>; + + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option>; + + fn test(&self) -> Result<(), crate::Error> { + Ok(()) + } + } + + impl<'a, T> Tokenize for &'a T + where + T: Tokenize, + { + fn properties(&self) -> PropertiesMut { + (*self).properties() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + (*self).tokenize(text) + } + + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option> { + (*self).tokenize_sentence(sentence) + } + + fn test(&self) -> Result<(), crate::Error> { + (*self).test() + } + } + + #[derive(Serialize, Deserialize, Clone)] + pub struct Pipeline(pub(super) T, pub(super) PropertiesMut); +} + +#[derive(Serialize, Deserialize, Clone)] +pub struct Pipeline(T, PropertiesMut); + +impl transform::Pipeline { + pub fn components(&self) -> &T { + &self.0 + } + + pub fn components_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +impl tokenize::Pipeline { + pub fn components(&self) -> &T { + &self.0 + } + + pub fn components_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +impl Pipeline { + pub fn components(&self) -> &T { + &self.0 + } + + pub fn components_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +#[derive(Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("unset token property: {0:?}.")] + Unset(Property), + #[error("invalid pipeline: properties {0:?} are read without being written.")] + InvalidPipeline(Vec), +} + +#[derive(Debug, Clone, Copy)] +pub enum Property { + Tags = 0, + Chunks = 1, +} + +impl Property { + pub fn properties() -> &'static [Property] { + &[Property::Tags, Property::Chunks] + } +} + +#[derive(Debug, Copy, Clone, Serialize, Deserialize, Default)] +struct Bitset(u16); + +impl Bitset { + pub fn insert(&mut self, value: Property) { + self.0 |= 1 << (value as u16); + } + + pub fn contains(&self, value: &Property) -> bool { + self.0 & (1 << (*value as u16)) != 0 + } + + pub fn union(mut self, other: Bitset) -> Self { + self.0 |= other.0; + self + } + + pub fn intersection(mut self, other: Bitset) -> Self { + self.0 &= other.0; + self + } + + pub fn inverse(mut self) -> Self { + self.0 = !self.0; + self + } + + pub fn into_iter<'a>(self) -> impl Iterator + 'a { + Property::properties().iter().filter_map(move |property| { + if self.contains(property) { + Some(*property) + } else { + None + } + }) + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] +pub struct Properties { + read_mask: Bitset, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] +pub struct PropertiesMut { + read_mask: Bitset, + write_mask: Bitset, +} + +impl std::iter::FromIterator for Properties { + fn from_iter>(iter: T) -> Self { + let mut out = Properties::default(); + + for properties in iter { + out = out.union(properties) + } + + out + } +} + +impl std::iter::FromIterator for PropertiesMut { + fn from_iter>(iter: T) -> Self { + let mut out = PropertiesMut::default(); + + for properties in iter { + out = out.union(properties) + } + + out + } +} + +impl Properties { + pub fn read(mut self, properties: &[Property]) -> Self { + for property in properties { + self.read_mask.insert(*property); + } + + self + } + + pub fn write(self, properties: &[Property]) -> PropertiesMut { + let mut write_mask = Bitset::default(); + let mut read_mask = self.read_mask; + + for property in properties { + // write implies read + read_mask.insert(*property); + write_mask.insert(*property); + } + + PropertiesMut { + read_mask, + write_mask, + } + } + + pub fn union(mut self, properties: Properties) -> Self { + self.read_mask = self.read_mask.union(properties.read_mask); + + self + } + + pub fn build(&self, sentence: &Sentence) -> Result { + for property in Property::properties() { + if self.read_mask.contains(property) { + match *property { + Property::Tags => { + if sentence.first().tags.is_none() { + return Err(Error::Unset(Property::Tags)); + } + } + Property::Chunks => { + if sentence.first().chunks.is_none() { + return Err(Error::Unset(Property::Chunks)); + } + } + } + } + } + + Ok(PropertyGuard { + read_mask: self.read_mask, + }) + } +} + +impl PropertiesMut { + pub(crate) fn reads_without_write(&self) -> impl Iterator { + self.read_mask + .intersection(self.write_mask.inverse()) + .into_iter() + } + + pub fn union(mut self, properties: PropertiesMut) -> Self { + self.read_mask = self.read_mask.union(properties.read_mask); + self.write_mask = self.write_mask.union(properties.read_mask); + + self + } + + pub fn chain(mut self, next: PropertiesMut) -> Self { + let next_reads = next.read_mask.intersection(next.write_mask.inverse()); + let new_reads = next_reads.intersection(self.write_mask.inverse()); + + self.read_mask = self.read_mask.union(new_reads); + self.write_mask = self.write_mask.union(next.write_mask); + self + } + + pub fn build(&self, sentence: &mut Sentence) -> Result { + for property in Property::properties() { + if self.write_mask.contains(property) { + match property { + Property::Tags => { + if sentence.first().tags.is_none() { + sentence + .iter_mut() + .for_each(|token| token.tags = Some(Tags::default())); + } + } + Property::Chunks => { + if sentence.first().chunks.is_none() { + sentence + .iter_mut() + .for_each(|token| token.chunks = Some(Vec::default())); + } + } + } + } + } + + for property in Property::properties() { + if self.read_mask.contains(property) { + match *property { + Property::Tags => { + if sentence.first().tags.is_none() { + return Err(Error::Unset(Property::Tags)); + } + } + Property::Chunks => { + if sentence.first().chunks.is_none() { + return Err(Error::Unset(Property::Chunks)); + } + } + } + } + } + + Ok(PropertyGuardMut { + read_mask: self.read_mask, + write_mask: self.write_mask, + }) + } +} + +#[derive(Debug, Copy, Clone)] +pub struct PropertyGuard { + read_mask: Bitset, +} + +#[derive(Debug, Copy, Clone)] +pub struct PropertyGuardMut { + read_mask: Bitset, + write_mask: Bitset, +} + +impl PropertyGuard { + pub fn chunks<'a>(&self, token: &'a Token) -> Result<&'a [String], Error> { + match ( + token.chunks.as_deref(), + self.read_mask.contains(&Property::Chunks), + ) { + (Some(chunks), true) => Ok(chunks), + _ => Err(Error::Unset(Property::Chunks)), + } + } + + pub fn tags<'a, 't>(&self, token: &'a Token<'t>) -> Result<&'a Tags<'t>, Error> { + match ( + token.tags.as_ref(), + self.read_mask.contains(&Property::Tags), + ) { + (Some(tags), true) => Ok(tags), + _ => Err(Error::Unset(Property::Tags)), + } + } +} + +impl PropertyGuardMut { + pub fn chunks<'a>(&self, token: &'a Token) -> Result<&'a [String], Error> { + match ( + token.chunks.as_deref(), + self.read_mask.contains(&Property::Chunks), + ) { + (Some(chunks), true) => Ok(chunks), + _ => Err(Error::Unset(Property::Chunks)), + } + } + + pub fn tags<'a, 't>(&self, token: &'a Token<'t>) -> Result<&'a Tags<'t>, Error> { + match ( + token.tags.as_ref(), + self.read_mask.contains(&Property::Tags), + ) { + (Some(tags), true) => Ok(tags), + _ => Err(Error::Unset(Property::Tags)), + } + } + + pub fn chunks_mut<'a, 't>( + &self, + token: &'a mut Token<'t>, + ) -> Result<&'a mut Vec, Error> { + match ( + token.chunks.as_mut(), + self.write_mask.contains(&Property::Chunks), + ) { + (Some(chunks), true) => Ok(chunks), + _ => Err(Error::Unset(Property::Chunks)), + } + } + + pub fn tags_mut<'a, 't>(&self, token: &'a mut Token<'t>) -> Result<&'a mut Tags<'t>, Error> { + match ( + token.tags.as_mut(), + self.write_mask.contains(&Property::Tags), + ) { + (Some(tags), true) => Ok(tags), + _ => Err(Error::Unset(Property::Tags)), + } + } + + pub fn downgrade(self) -> PropertyGuard { + PropertyGuard { + read_mask: self.read_mask, + } + } +} + +pub trait CreatePipe: Sized { + fn new(components: T) -> Result; +} + +macro_rules! make_subpipe { + ($pipe:ty, $first:expr) => { + Ok::<_, crate::Error>($first) + }; + ($pipe:ty, $first:expr, $($name:expr),+) => { + <$pipe>::new(($first, $($name,)+)) + } +} + +macro_rules! impl_pipeline { + ( $first:ident, $last:ident, $($name:ident),*) => { + // Case 1: Tokenize -> Transform -> ... -> Transform + impl<$first: Tokenize, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for tokenize::Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case, unused_mut)] + fn new(components: ($first, $($name,)* $last)) -> Result { + let (ref $first, $(ref $name,)* ref $last) = components; + + let mut properties = $first.properties(); + $(properties = properties.chain($name.properties());)* + properties.chain($last.properties()); + + if !properties.reads_without_write().next().is_none() { + return Err(Error::InvalidPipeline(properties.reads_without_write().collect())); + } + + Ok(tokenize::Pipeline(components, properties)) + } + } + + impl<$first: Tokenize, $($name: Transform,)* $last: Transform> Tokenize for tokenize::Pipeline<($first, $($name,)* $last)> { + fn properties(&self) -> PropertiesMut { + self.1 + } + + #[allow(non_snake_case)] + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + let sentences = $first.tokenize(text).map(move |mut sentence| { + $(sentence = $name.transform(sentence).unwrap();)* + sentence = $last.transform(sentence).unwrap(); + sentence + }); + + Box::new(sentences) + } + + #[allow(non_snake_case, unused_mut)] + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option { + let (ref $first, $(ref $name,)* ref $last) = self.0; + let mut sentence = $first.tokenize_sentence(sentence)?; + $(sentence = $name.transform(sentence).unwrap();)* + Some($last.transform(sentence).unwrap()) + } + + #[allow(non_snake_case)] + fn test(&self) -> Result<(), crate::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + let subpipe = make_subpipe!(tokenize::Pipeline<_>, $first $(,$name)*)?; + subpipe.test()?; + + $last.test(subpipe)?; + + Ok(()) + } + } + + // Case 2: Transform -> ... -> Transform + impl<$first: Transform, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for transform::Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case, unused_mut)] + fn new(components: ($first, $($name,)* $last)) -> Result { + let (ref $first, $(ref $name,)* ref $last) = components; + + let mut properties = $first.properties(); + $(properties = properties.chain($name.properties());)* + properties.chain($last.properties()); + + Ok(transform::Pipeline(components, properties)) + } + } + + impl<$first: Transform, $($name: Transform,)* $last: Transform> Transform for transform::Pipeline<($first, $($name,)* $last)> { + fn properties(&self) -> PropertiesMut { + self.1 + } + + #[allow(non_snake_case)] + fn transform<'t>(&'t self, mut sentence: Sentence<'t>) -> Result, crate::properties::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + sentence = $first.transform(sentence)?; + $(sentence = $name.transform(sentence)?;)* + sentence = $last.transform(sentence)?; + Ok(sentence) + } + + #[allow(non_snake_case)] + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + $first.test(&tokenizer)?; + let tokenizer_pipe = tokenize::Pipeline::new((&tokenizer, $first))?; + let subpipe = make_subpipe!(transform::Pipeline<_>, $($name,)* $last)?; + + subpipe.test(tokenizer_pipe)?; + Ok(()) + } + } + + // Case 3: Tokenize -> Transform -> ... -> Transform -> Suggest + impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> CreatePipe<($first, $($name,)* $last)> for Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case, unused_mut)] + fn new(components: ($first, $($name,)* $last)) -> Result { + let (ref $first, $(ref $name,)* ref $last) = components; + + let mut properties = $first.properties(); + $(properties = properties.chain($name.properties());)* + properties.chain($last.properties().write(&[])); + + if !properties.reads_without_write().next().is_none() { + return Err(Error::InvalidPipeline(properties.reads_without_write().collect())); + } + + Ok(Pipeline(components, properties)) + } + } + + impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> Pipeline<($first, $($name,)* $last)> { + pub fn properties(&self) -> PropertiesMut { + self.1 + } + + #[allow(non_snake_case, unused_mut)] + pub fn suggest<'t>(&'t self, text: &'t str) -> impl Iterator> + 't { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + let sentences = $first.tokenize(text).map(move |mut sentence| { + $(sentence = $name.transform(sentence).unwrap();)* + $last.suggest(&sentence).unwrap() + }); + + sentences + } + + #[allow(non_snake_case, unused_mut)] + pub fn correct<'t>(&'t self, text: &'t str) -> impl Iterator + 't { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + let sentences = $first.tokenize(text).map(move |mut sentence| { + $(sentence = $name.transform(sentence).unwrap();)* + $last.correct(&sentence).unwrap() + }); + + sentences + } + + #[allow(non_snake_case)] + pub fn test(&self) -> Result<(), crate::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + let subpipe = make_subpipe!(tokenize::Pipeline<_>, $first $(,$name)*)?; + subpipe.test()?; + + $last.test(subpipe)?; + + Ok(()) + } + } + }; +} + +impl_pipeline! { A, B, } +impl_pipeline! { A, C, B } +impl_pipeline! { A, D, B, C } +impl_pipeline! { A, E, B, C, D } diff --git a/nlprule/src/rule/disambiguation.rs b/nlprule/src/rule/disambiguation.rs index 207c289..c684a53 100644 --- a/nlprule/src/rule/disambiguation.rs +++ b/nlprule/src/rule/disambiguation.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::types::*; +use crate::{properties::PropertyGuardMut, types::*}; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -52,21 +52,25 @@ pub enum Disambiguation { } impl Disambiguation { - pub fn apply<'t>(&'t self, groups: Vec>>) { + pub fn apply<'t>( + &'t self, + groups: Vec>>, + guard: PropertyGuardMut, + ) -> Result<(), crate::properties::Error> { match self { Disambiguation::Remove(data_or_filters) => { for (group, data_or_filter) in groups.into_iter().zip(data_or_filters) { for token in group.into_iter() { match data_or_filter { either::Left(data) => { - token.tags_mut().retain(|x| { + guard.tags_mut(token)?.retain(|x| { !(x.pos() == data.pos() && (data.lemma().as_str().is_empty() || x.lemma() == data.lemma())) }); } either::Right(filter) => { - filter.remove(token.tags_mut()); + filter.remove(guard.tags_mut(token)?); } } } @@ -78,31 +82,32 @@ impl Disambiguation { match data_or_filter { either::Left(limit) => { for token in group.into_iter() { - let last = token - .tags() - .iter() - .next() - .and_then(|x| { - if *x.lemma() != WordId::empty() { - Some(x.lemma().clone()) - } else { - None - } - }) - .unwrap_or_else(|| token.text().clone()); - - token.tags_mut().retain(|x| x.pos() == limit.pos()); - - if token.tags().is_empty() { + let last = { + let tags = guard.tags(token)?; + tags.iter() + .next() + .and_then(|x| { + if *x.lemma() != WordId::empty() { + Some(x.lemma().clone()) + } else { + None + } + }) + .unwrap_or_else(|| tags.id().clone()) + }; + + guard.tags_mut(token)?.retain(|x| x.pos() == limit.pos()); + + if guard.tags(token)?.is_empty() { if *retain_last { - token - .tags_mut() + guard + .tags_mut(token)? .push(WordData::new(last, limit.pos().clone())); } else { - let lemma = token.text().clone(); + let lemma = guard.tags(token)?.id().clone(); - token - .tags_mut() + guard + .tags_mut(token)? .push(WordData::new(lemma, limit.pos().clone())); } } @@ -110,7 +115,7 @@ impl Disambiguation { } either::Right(filter) => { for token in group.into_iter() { - filter.keep(token.tags_mut()); + filter.keep(guard.tags_mut(token)?); } } } @@ -122,15 +127,17 @@ impl Disambiguation { for token in group.into_iter() { let data = WordData::new( if data.lemma().as_str().is_empty() { - token.text().clone() + guard.tags(token)?.id().clone() } else { data.lemma().clone() }, data.pos().clone(), ); - token.tags_mut().push(data); - token.tags_mut().retain(|x| !x.pos().as_str().is_empty()); + let tags = guard.tags_mut(token)?; + + tags.push(data); + tags.retain(|x| !x.pos().as_str().is_empty()); } } } @@ -139,15 +146,17 @@ impl Disambiguation { for token in group.into_iter() { let data = WordData::new( if data.lemma().as_str().is_empty() { - token.text().clone() + guard.tags(token)?.id().clone() } else { data.lemma().clone() }, data.pos().clone(), ); - token.tags_mut().clear(); - token.tags_mut().push(data); + let tags = guard.tags_mut(token)?; + + tags.clear(); + tags.push(data); } } } @@ -160,14 +169,14 @@ impl Disambiguation { for token in group.iter() { if *use_mask_val { for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) { - *mask_val = *mask_val && PosFilter::and(filter, token.tags()); + *mask_val = *mask_val && PosFilter::and(filter, guard.tags(token)?); } } } } if !filter_mask.iter().any(|x| *x) { - return; + return Ok(()); } let to_apply: Vec<_> = filter_mask @@ -188,16 +197,17 @@ impl Disambiguation { { if *use_mask_val { for token in group.into_iter() { - let before = token.tags().clone(); + let tags = guard.tags_mut(token)?; + let before = tags.clone(); - PosFilter::apply(&to_apply, token.tags_mut()); + PosFilter::apply(&to_apply, tags); if let Some(disambig) = disambig { - disambig.keep(token.tags_mut()); + disambig.keep(tags); } - if token.tags().is_empty() { - *token.tags_mut() = before; + if tags.is_empty() { + *tags = before; } } } @@ -205,6 +215,8 @@ impl Disambiguation { } Disambiguation::Nop => {} } + + Ok(()) } } diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index 6e042de..d0fdef5 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -1,6 +1,6 @@ use std::iter; -use crate::{tokenizer::tag::Tagger, types::*, utils::regex::Regex}; +use crate::{components::tagger::Tagger, properties::*, types::*, utils::regex::Regex}; use enum_dispatch::enum_dispatch; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; @@ -161,10 +161,15 @@ pub struct Quantifier { #[enum_dispatch] pub trait Atomable: Send + Sync { - fn is_match(&self, context: Context, position: usize) -> bool; + fn is_match(&self, context: Context, position: usize) + -> Result; + + fn compute_properties(&self) -> Properties { + Properties::default() + } } -#[enum_dispatch(Atomable)] +#[enum_dispatch(Atomable, ReadProperties)] #[derive(Debug, Serialize, Deserialize, Clone)] pub enum Atom { ChunkAtom(concrete::ChunkAtom), @@ -180,7 +185,8 @@ pub enum Atom { } pub mod concrete { - use super::{Atomable, Context, Matcher, TextMatcher, WordDataMatcher}; + use super::{Atomable, Context, Matcher, Properties, Property, TextMatcher, WordDataMatcher}; + use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; #[derive(Debug, Serialize, Deserialize, Clone)] @@ -189,11 +195,25 @@ pub mod concrete { } impl Atomable for TextAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; - self.matcher - .is_match(&sentence.index(position).text(), Some(context), None) + Ok(self.matcher.is_match( + sentence.guard().tags(sentence.index(position))?.id(), + Some(context), + None, + )) + } + + fn compute_properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); + } + *PROPERTIES } } @@ -203,11 +223,25 @@ pub mod concrete { } impl Atomable for ChunkAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; - self.matcher - .is_slice_match(&sentence.index(position).chunks(), Some(context), None) + Ok(self.matcher.is_slice_match( + sentence.guard().chunks(sentence.index(position))?, + Some(context), + None, + )) + } + + fn compute_properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Chunks]); + } + *PROPERTIES } } @@ -217,10 +251,14 @@ pub mod concrete { } impl Atomable for SpaceBeforeAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; - sentence.index(position).has_space_before() == self.value + Ok(sentence.index(position).has_space_before() == self.value) } } @@ -231,12 +269,24 @@ pub mod concrete { } impl Atomable for WordDataAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; - let tags = sentence.index(position).tags().iter(); + let tags = sentence.guard().tags(sentence.index(position))?.iter(); - self.matcher - .is_match(tags, Some(context), Some(self.case_sensitive)) + Ok(self + .matcher + .is_match(tags, Some(context), Some(self.case_sensitive))) + } + + fn compute_properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); + } + *PROPERTIES } } } @@ -245,8 +295,12 @@ pub mod concrete { pub struct TrueAtom {} impl Atomable for TrueAtom { - fn is_match(&self, _context: Context, _position: usize) -> bool { - true + fn is_match( + &self, + _context: Context, + _position: usize, + ) -> Result { + Ok(true) } } @@ -254,8 +308,12 @@ impl Atomable for TrueAtom { pub struct FalseAtom {} impl Atomable for FalseAtom { - fn is_match(&self, _context: Context, _position: usize) -> bool { - false + fn is_match( + &self, + _context: Context, + _position: usize, + ) -> Result { + Ok(false) } } @@ -265,8 +323,22 @@ pub struct AndAtom { } impl Atomable for AndAtom { - fn is_match(&self, context: Context, position: usize) -> bool { - self.atoms.iter().all(|x| x.is_match(context, position)) + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { + for atom in &self.atoms { + if !atom.is_match(context, position)? { + return Ok(false); + } + } + + Ok(true) + } + + fn compute_properties(&self) -> Properties { + self.atoms.iter().map(Atom::compute_properties).collect() } } @@ -276,8 +348,22 @@ pub struct OrAtom { } impl Atomable for OrAtom { - fn is_match(&self, context: Context, position: usize) -> bool { - self.atoms.iter().any(|x| x.is_match(context, position)) + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { + for atom in &self.atoms { + if atom.is_match(context, position)? { + return Ok(true); + } + } + + Ok(false) + } + + fn compute_properties(&self) -> Properties { + self.atoms.iter().map(Atom::compute_properties).collect() } } @@ -287,8 +373,16 @@ pub struct NotAtom { } impl Atomable for NotAtom { - fn is_match(&self, context: Context, position: usize) -> bool { - !self.atom.is_match(context, position) + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { + Ok(!self.atom.is_match(context, position)?) + } + + fn compute_properties(&self) -> Properties { + self.atom.compute_properties() } } @@ -299,15 +393,25 @@ pub struct OffsetAtom { } impl Atomable for OffsetAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; let new_position = position as isize + self.offset; - if new_position < 0 || (new_position as usize) >= sentence.len() { - false - } else { - self.atom.is_match(context, new_position as usize) - } + Ok( + if new_position < 0 || (new_position as usize) >= sentence.len() { + false + } else { + self.atom.is_match(context, new_position as usize)? + }, + ) + } + + fn compute_properties(&self) -> Properties { + self.atom.compute_properties() } } @@ -357,33 +461,20 @@ impl GraphId { } } -lazy_static! { - static ref SENT_START: Token<'static> = Token::new( - WordId::empty(), - Tags::new(vec![WordData::new( - WordId::empty(), - PosId::special(SpecialPos::SentStart), - )],), - Span::default(), - false, - false, - Vec::new(), - ); -} - -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct MatchSentence<'t> { sentence: &'t Sentence<'t>, + guard: PropertyGuard, } impl<'t> MatchSentence<'t> { - pub fn new(sentence: &'t Sentence<'t>) -> Self { - MatchSentence { sentence } + pub fn new(sentence: &'t Sentence<'t>, guard: PropertyGuard) -> Self { + MatchSentence { sentence, guard } } pub fn index(&self, index: usize) -> &Token { match index { - 0 => &*SENT_START, + 0 => &crate::types::SENT_START, i => &self.sentence.tokens()[i - 1], } } @@ -409,6 +500,10 @@ impl<'t> MatchSentence<'t> { self.sentence.tagger() } + pub fn guard(&self) -> &PropertyGuard { + &self.guard + } + pub fn span(&self) -> &Span { self.sentence.span() } @@ -518,7 +613,19 @@ pub struct Composition { } impl Composition { - fn next_can_match(&self, context: Context, position: usize, index: usize) -> bool { + pub fn compute_properties(&self) -> Properties { + self.parts + .iter() + .map(|part| part.atom.compute_properties()) + .collect() + } + + fn next_can_match( + &self, + context: Context, + position: usize, + index: usize, + ) -> Result { let next_required_pos = match self.parts[index + 1..] .iter() .position(|x| x.quantifier.min > 0) @@ -527,9 +634,13 @@ impl Composition { None => self.parts.len(), }; - self.parts[index + 1..next_required_pos] - .iter() - .any(|x| x.atom.is_match(context, position)) + for part in &self.parts[index + 1..next_required_pos] { + if part.atom.is_match(context, position)? { + return Ok(true); + } + } + + Ok(false) } fn apply_recursive<'t>( @@ -538,7 +649,7 @@ impl Composition { mut position: usize, mut cur_atom_idx: usize, mut graph: MatchGraph<'t>, - ) -> Option> { + ) -> Result>, crate::properties::Error> { let mut cur_count = 0; let is_match = loop { if cur_atom_idx >= self.parts.len() { @@ -561,21 +672,23 @@ impl Composition { } if cur_count >= part.quantifier.min && cur_atom_idx + 1 < self.parts.len() { - if !part.greedy && self.next_can_match((sentence, &graph), position, cur_atom_idx) { + if !part.greedy + && self.next_can_match((sentence, &graph), position, cur_atom_idx)? + { cur_atom_idx += 1; cur_count = 0; continue; } if part.greedy { if let Some(graph) = - self.apply_recursive(sentence, position, cur_atom_idx + 1, graph.clone()) + self.apply_recursive(sentence, position, cur_atom_idx + 1, graph.clone())? { - return Some(graph); + return Ok(Some(graph)); } } } - if part.atom.is_match((sentence, &graph), position) { + if part.atom.is_match((sentence, &graph), position)? { let group = &mut graph.groups[cur_atom_idx + 1]; // set the group beginning if the char end was zero (i. e. the group was empty) @@ -599,19 +712,21 @@ impl Composition { cur_atom_idx += 1; } - if is_match || cur_atom_idx == self.parts.len() || self.can_stop_mask[cur_atom_idx] { - graph.fill_empty(sentence); - Some(graph) - } else { - None - } + Ok( + if is_match || cur_atom_idx == self.parts.len() || self.can_stop_mask[cur_atom_idx] { + graph.fill_empty(sentence); + Some(graph) + } else { + None + }, + ) } pub fn apply<'t>( &'t self, sentence: &'t MatchSentence, start: usize, - ) -> Option> { + ) -> Result>, crate::properties::Error> { // this path is extremely hot so more optimizations are done // the first matcher can never rely on the match graph, so we use an empty default graph for the first match @@ -623,9 +738,9 @@ impl Composition { if self.parts[0].quantifier.min > 0 && !self.parts[0] .atom - .is_match((sentence, &DEFAULT_GRAPH), start) + .is_match((sentence, &DEFAULT_GRAPH), start)? { - return None; + return Ok(None); } let position = start; diff --git a/nlprule/src/rule/engine/mod.rs b/nlprule/src/rule/engine/mod.rs index 22ec069..ceda355 100644 --- a/nlprule/src/rule/engine/mod.rs +++ b/nlprule/src/rule/engine/mod.rs @@ -1,4 +1,7 @@ +use std::iter; + use crate::{ + properties::*, types::*, utils::regex::{CaptureMatches, Regex}, }; @@ -14,14 +17,18 @@ pub struct TokenEngine { } impl TokenEngine { - fn get_match<'t>(&'t self, sentence: &'t MatchSentence, i: usize) -> Option> { - if let Some(graph) = self.composition.apply(sentence, i) { + fn get_match<'t>( + &'t self, + sentence: &'t MatchSentence, + i: usize, + ) -> Result>, crate::properties::Error> { + if let Some(graph) = self.composition.apply(sentence, i)? { let mut blocked = false; // TODO: cache / move to outer loop for i in 0..sentence.len() { for antipattern in &self.antipatterns { - if let Some(anti_graph) = antipattern.apply(sentence, i) { + if let Some(anti_graph) = antipattern.apply(sentence, i)? { let anti_start = anti_graph.by_index(0).span.char().start; let anti_end = anti_graph .by_index(anti_graph.groups().len() - 1) @@ -44,11 +51,11 @@ impl TokenEngine { } if !blocked { - return Some(graph); + return Ok(Some(graph)); } } - None + Ok(None) } } @@ -84,7 +91,7 @@ pub struct EngineMatches<'a, 't> { } impl<'a, 't> Iterator for EngineMatches<'a, 't> { - type Item = MatchGraph<'t>; + type Item = Result, crate::properties::Error>; fn next(&mut self) -> Option { let sentence = self.sentence; @@ -93,22 +100,25 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> { match &mut self.inner { InnerMatches::Token(inner) => (inner.index..sentence.len()).find_map(|i| { - inner.engine.get_match(sentence, i).and_then(|graph| { - let start_group = graph.by_id(start_id); - let end_group = graph.by_id(end_id); + match inner.engine.get_match(sentence, i) { + Ok(graph) => graph.and_then(|graph| { + let start_group = graph.by_id(start_id); + let end_group = graph.by_id(end_id); - let start = start_group.span.char().start - sentence.span().char().start; - let end = end_group.span.char().end - sentence.span().char().start; + let start = start_group.span.char().start - sentence.span().char().start; + let end = end_group.span.char().end - sentence.span().char().start; - if inner.mask[start..end].iter().all(|x| !x) { - inner.mask[start..end].iter_mut().for_each(|x| *x = true); + if inner.mask[start..end].iter().all(|x| !x) { + inner.mask[start..end].iter_mut().for_each(|x| *x = true); - inner.index += 1; - Some(graph) - } else { - None - } - }) + inner.index += 1; + Some(Ok(graph)) + } else { + None + } + }), + Err(err) => Some(Err(err)), + } }), InnerMatches::Text(inner) => inner.captures.next().map(|captures| { let bi_to_ci = &inner.byte_idx_to_char_idx; @@ -134,13 +144,25 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> { } } - MatchGraph::new(groups, inner.id_to_idx) + Ok(MatchGraph::new(groups, inner.id_to_idx)) }), } } } impl Engine { + pub fn compute_properties(&self) -> Properties { + match &self { + Engine::Token(engine) => engine + .antipatterns + .iter() + .map(|x| x.compute_properties()) + .chain(iter::once(engine.composition.compute_properties())) + .collect(), + Engine::Text(_, _) => Properties::default(), + } + } + pub fn get_matches<'a, 't>( &'a self, sentence: &'t MatchSentence, diff --git a/nlprule/src/rule/id.rs b/nlprule/src/rule/id.rs index 7c0c21b..850843f 100644 --- a/nlprule/src/rule/id.rs +++ b/nlprule/src/rule/id.rs @@ -17,11 +17,10 @@ //! Select individal rules: //! //! ```no_run -//! use nlprule::{Tokenizer, Rules, rule::id::Category}; +//! use nlprule::{lang::en, rule::id::Category}; //! use std::convert::TryInto; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; -//! let mut rules = Rules::new("path/to/en_rules.bin")?; +//! let mut rules = en::rules(); //! //! // disable rules named "confusion_due_do" in category "confused_words" //! rules diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index 5be2342..ce10292 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -1,16 +1,17 @@ //! Implementations related to single rules. -use crate::types::*; use crate::{ filter::{Filter, Filterable}, - tokenizer::Tokenizer, + properties::*, + types::*, utils, }; use itertools::Itertools; -use log::{error, info, warn}; +use lazy_static::lazy_static; +use log::{debug, error}; use serde::{Deserialize, Serialize}; -use std::collections::HashSet; use std::fmt; +use std::{collections::HashSet, iter}; pub(crate) mod disambiguation; pub(crate) mod engine; @@ -38,7 +39,11 @@ pub(crate) struct Unification { } impl Unification { - pub fn keep(&self, graph: &MatchGraph, sentence: &MatchSentence) -> bool { + pub fn keep( + &self, + graph: &MatchGraph, + sentence: &MatchSentence, + ) -> Result { let filters: Vec<_> = self.filters.iter().multi_cartesian_product().collect(); let mut filter_mask: Vec<_> = filters.iter().map(|_| true).collect(); @@ -48,18 +53,22 @@ impl Unification { if maybe_mask_val.is_some() { for token in group.tokens(sentence) { for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) { - *mask_val = *mask_val && PosFilter::and(filter, token.tags()); + *mask_val = + *mask_val && PosFilter::and(filter, sentence.guard().tags(token)?); } } } } let result = filter_mask.iter().any(|x| *x); - if negate { - !result - } else { - result + Ok(if negate { !result } else { result }) + } + + pub fn compute_properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); } + *PROPERTIES } } @@ -126,21 +135,33 @@ impl Changes { } impl DisambiguationRule { + pub fn compute_properties(&self) -> PropertiesMut { + iter::once(self.engine.compute_properties()) + .chain(self.unification.iter().map(|x| x.compute_properties())) + .collect::() + .write(&[Property::Tags]) + } + /// Get a unique identifier of this rule. pub fn id(&self) -> &Index { &self.id } - pub(crate) fn apply<'t>(&'t self, sentence: &MatchSentence<'t>) -> Changes { + pub(crate) fn apply<'t>( + &'t self, + sentence: &MatchSentence<'t>, + ) -> Result { if matches!(self.disambiguations, disambiguation::Disambiguation::Nop) { - return Changes::default(); + return Ok(Changes::default()); } let mut all_spans = Vec::new(); for graph in self.engine.get_matches(sentence, self.start, self.end) { + let graph = graph?; + if let Some(unification) = &self.unification { - if !unification.keep(&graph, sentence) { + if !unification.keep(&graph, sentence)? { continue; } } @@ -165,11 +186,16 @@ impl DisambiguationRule { all_spans.push(spans); } - Changes(all_spans) + Ok(Changes(all_spans)) } - pub(crate) fn change<'t>(&'t self, sentence: &mut Sentence<'t>, changes: Changes) { - log::info!("applying {}", self.id); + pub(crate) fn change<'t>( + &'t self, + sentence: &mut Sentence<'t>, + changes: Changes, + guard: PropertyGuardMut, + ) -> Result<(), crate::properties::Error> { + debug!("applying {}", self.id); for spans in changes.0 { let mut groups = Vec::new(); @@ -185,43 +211,51 @@ impl DisambiguationRule { groups.push(group); } - self.disambiguations.apply(groups); + self.disambiguations.apply(groups, guard)?; } + + Ok(()) } /// Often there are examples associated with a rule. /// This method checks whether the correct action is taken in the examples. - pub fn test(&self, tokenizer: &Tokenizer) -> bool { + pub(crate) fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { let mut passes = Vec::new(); - for (i, test) in self.examples.iter().enumerate() { + for test in self.examples.iter() { let text = match test { disambiguation::DisambiguationExample::Unchanged(x) => x.as_str(), disambiguation::DisambiguationExample::Changed(x) => x.text.as_str(), }; // by convention examples are always considered as one sentence even if the sentencizer would split - let sentence_before = tokenizer.disambiguate_up_to_id( - tokenizer - .tokenize(text) - .expect("test text must not be empty"), - Some(&self.id), - ); + let sentence_before = tokenizer + .tokenize_sentence(text) + .expect("test text must not be empty"); // shift the sentence to the right before matching to make sure // nothing assumes the sentene starts from absolute index zero let shift_delta = Position { byte: 1, char: 1 }; - let sentence_before_complete = sentence_before.clone().rshift(shift_delta); + let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta); + + let guard = self + .compute_properties() + .build(&mut sentence_before_complete)?; + let changes = self - .apply(&MatchSentence::new(&sentence_before_complete)) + .apply(&MatchSentence::new( + &sentence_before_complete, + guard.downgrade(), + )) + .unwrap() .lshift(shift_delta); let mut sentence_after = sentence_before.clone(); if !changes.is_empty() { - self.change(&mut sentence_after, changes); + self.change(&mut sentence_after, changes, guard).unwrap(); } - info!("Tokens: {:#?}", sentence_before); + debug!("Tokens: {:#?}", sentence_before); let pass = match test { disambiguation::DisambiguationExample::Unchanged(_) => { @@ -238,38 +272,29 @@ impl DisambiguationRule { .find(|x| *x.span().char() == change.char_span) .unwrap(); - let unordered_tags = after.tags().iter().collect::>(); + let unordered_tags = + after.tags().unwrap().iter().collect::>(); let unordered_tags_change = change.after.iter().collect::>(); - let pass = unordered_tags == unordered_tags_change; - if !pass { - println!("{:#?} ---- {:#?}", unordered_tags, unordered_tags_change); - } - pass + unordered_tags == unordered_tags_change } }; if !pass { - let error_str = format!( + error!( "Rule {}: Test \"{:#?}\" failed. Before: {:#?}. After: {:#?}.", - self.id, test, sentence_before, sentence_after, - ); - - if tokenizer - .lang_options() - .known_failures - .contains(&format!("{}:{}", self.id, i)) - { - warn!("{}", error_str) - } else { - error!("{}", error_str) - } + self.id, test, sentence_before, sentence_after + ) } passes.push(pass); } - passes.iter().all(|x| *x) + if passes.iter().all(|x| *x) { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } } } @@ -280,82 +305,92 @@ pub struct Suggestions<'a, 't> { sentence: &'t MatchSentence<'t>, } -impl<'a, 't> Iterator for Suggestions<'a, 't> { - type Item = Suggestion; - - fn next(&mut self) -> Option { - let rule = self.rule; - let sentence = self.sentence; - let (start, end) = (self.rule.start, self.rule.end); - - self.matches.find_map(|graph| { - if let Some(unification) = &rule.unification { - if !unification.keep(&graph, sentence) { - return None; - } +impl<'a, 't> Suggestions<'a, 't> { + fn suggest_from_graph( + graph: Result, + rule: &'a Rule, + sentence: &'t MatchSentence<'t>, + ) -> Result, crate::properties::Error> { + let graph = graph?; + + if let Some(unification) = &rule.unification { + if !unification.keep(&graph, sentence)? { + return Ok(None); } + } + + let start_group = graph.by_id(rule.start); + let end_group = graph.by_id(rule.end); - let start_group = graph.by_id(start); - let end_group = graph.by_id(end); + let replacements: Vec = rule + .suggesters + .iter() + .filter_map(|x| x.apply(sentence, &graph, rule.start, rule.end)) + .collect(); - let replacements: Vec = rule - .suggesters + let start = if replacements + .iter() + .all(|x| utils::no_space_chars().chars().any(|c| x.starts_with(c))) + { + let first_token = graph.groups()[graph.get_index(rule.start)..] .iter() - .filter_map(|x| x.apply(sentence, &graph, start, end)) - .collect(); + .find_map(|x| x.tokens(sentence).next()) + .unwrap(); - let start = if replacements + let idx = sentence .iter() - .all(|x| utils::no_space_chars().chars().any(|c| x.starts_with(c))) - { - let first_token = graph.groups()[graph.get_index(start)..] - .iter() - .find_map(|x| x.tokens(sentence).next()) - .unwrap(); - - let idx = sentence - .iter() - .position(|x| std::ptr::eq(x, first_token)) - .unwrap_or(0); - - if idx > 0 { - sentence.index(idx - 1).span().end() - } else { - start_group.span.start() - } + .position(|x| std::ptr::eq(x, first_token)) + .unwrap_or(0); + + if idx > 0 { + sentence.index(idx - 1).span().end() } else { start_group.span.start() - }; - let end = end_group.span.end(); - - // this should never happen, but just return None instead of raising an Error - // `end` COULD be equal to `start` if the suggestion is to insert text at this position - if end < start { - return None; } + } else { + start_group.span.start() + }; + let end = end_group.span.end(); + + // this should never happen, but just return None instead of raising an Error + // `end` COULD be equal to `start` if the suggestion is to insert text at this position + if end < start { + return Ok(None); + } - let text_before = sentence.slice(Span::from_positions(start, end)); + let text_before = sentence.slice(Span::from_positions(start, end)); + + // fix e. g. "Super , dass" + let replacements: Vec = replacements + .into_iter() + .filter(|suggestion| *suggestion != text_before) + .map(|x| utils::fix_nospace_chars(&x)) + .collect(); + + Ok(if !replacements.is_empty() { + Some(Suggestion::new( + rule.id.to_string(), + rule.message + .apply(sentence, &graph, rule.start, rule.end) + .expect("Rules must have a message."), + Span::from_positions(start, end), + replacements, + )) + } else { + None + }) + } +} - // fix e. g. "Super , dass" - let replacements: Vec = replacements - .into_iter() - .filter(|suggestion| *suggestion != text_before) - .map(|x| utils::fix_nospace_chars(&x)) - .collect(); +impl<'a, 't> Iterator for Suggestions<'a, 't> { + type Item = Result; - if !replacements.is_empty() { - Some(Suggestion::new( - rule.id.to_string(), - rule.message - .apply(sentence, &graph, rule.start, rule.end) - .expect("Rules must have a message."), - Span::from_positions(start, end), - replacements, - )) - } else { - None - } - }) + fn next(&mut self) -> Option { + let rule = self.rule; + let sentence = self.sentence; + + self.matches + .find_map(|graph| Suggestions::suggest_from_graph(graph, rule, sentence).transpose()) } } @@ -414,6 +449,12 @@ impl Rule { self.enabled } + pub fn compute_properties(&self) -> Properties { + iter::once(self.engine.compute_properties()) + .chain(self.unification.iter().map(|x| x.compute_properties())) + .collect() + } + /// Get a unique identifier of this rule. pub fn id(&self) -> &Index { &self.id @@ -459,7 +500,7 @@ impl Rule { /// Grammar rules always have at least one example associated with them. /// This method checks whether the correct action is taken in the examples. - pub fn test(&self, tokenizer: &Tokenizer) -> bool { + pub(crate) fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { let mut passes = Vec::new(); // make sure relative position is handled correctly @@ -470,17 +511,17 @@ impl Rule { for test in self.examples.iter() { // by convention examples are always considered as one sentence even if the sentencizer would split let sentence = tokenizer - .disambiguate( - tokenizer - .tokenize(&test.text()) - .expect("test text must not be empty."), - ) + .tokenize_sentence(&test.text()) + .expect("test text must not be empty.") .rshift(shift_delta); - info!("Sentence: {:#?}", sentence); + debug!("Sentence: {:#?}", sentence); let suggestions: Vec<_> = self - .apply(&MatchSentence::new(&sentence)) - .map(|s| s.lshift(shift_delta)) + .apply(&MatchSentence::new( + &sentence, + self.compute_properties().build(&sentence)?, + )) + .map(|s| s.unwrap().lshift(shift_delta)) .collect(); let pass = if suggestions.len() > 1 { @@ -495,7 +536,7 @@ impl Rule { }; if !pass { - warn!( + error!( "Rule {}: test \"{}\" failed. Expected: {:#?}. Found: {:#?}.", self.id, test.text(), @@ -507,6 +548,10 @@ impl Rule { passes.push(pass); } - passes.iter().all(|x| *x) + if passes.iter().all(|x| *x) { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } } } diff --git a/nlprule/src/rules.rs b/nlprule/src/rules.rs deleted file mode 100644 index dc924fc..0000000 --- a/nlprule/src/rules.rs +++ /dev/null @@ -1,227 +0,0 @@ -//! Sets of grammatical error correction rules. - -use crate::types::*; -use crate::utils::parallelism::MaybeParallelRefIterator; -use crate::{rule::id::Selector, rule::MatchSentence, rule::Rule, tokenizer::Tokenizer, Error}; -use fs_err::File; -use serde::{Deserialize, Serialize}; -use std::{ - io::{BufReader, Read, Write}, - iter::FromIterator, - path::Path, -}; - -/// Language-dependent options for a rule set. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub(crate) struct RulesLangOptions { - /// Whether to allow errors while constructing the rules. - pub allow_errors: bool, - /// Grammar Rule selectors to use in this set. - #[serde(default)] - pub ids: Vec, - /// Grammar Rule selectors to ignore in this set. - #[serde(default)] - pub ignore_ids: Vec, -} - -impl Default for RulesLangOptions { - fn default() -> Self { - RulesLangOptions { - allow_errors: true, - ids: Vec::new(), - ignore_ids: Vec::new(), - } - } -} - -/// A set of grammatical error correction rules. -#[derive(Serialize, Deserialize, Default)] -pub struct Rules { - pub(crate) rules: Vec, -} - -impl Rules { - /// Creates a new rule set from a path to a binary. - /// - /// # Errors - /// - If the file can not be opened. - /// - If the file content can not be deserialized to a rules set. - pub fn new>(p: P) -> Result { - let reader = BufReader::new(File::open(p.as_ref())?); - let rules: Rules = bincode::deserialize_from(reader)?; - Ok(rules) - } - - /// Creates a new rules set from a reader. - pub fn from_reader(reader: R) -> Result { - Ok(bincode::deserialize_from(reader)?) - } - - /// Serializes this rules set to a writer. - pub fn to_writer(&self, writer: W) -> Result<(), Error> { - Ok(bincode::serialize_into(writer, &self)?) - } - - /// All rules ordered by priority. - pub fn rules(&self) -> &[Rule] { - &self.rules - } - - /// All rules ordered by priority (mutable). - pub fn rules_mut(&mut self) -> &mut [Rule] { - &mut self.rules - } - - /// Returns an iterator over all rules matching the selector. - pub fn select<'a>(&'a self, selector: &'a Selector) -> RulesIter<'a> { - RulesIter { - inner: self.rules.iter(), - selector: Some(selector), - } - } - - /// Returns an iterator over all rules matching the selector (mutable). - pub fn select_mut<'a>(&'a mut self, selector: &'a Selector) -> RulesIterMut<'a> { - RulesIterMut { - inner: self.rules.iter_mut(), - selector: Some(selector), - } - } - - /// Compute the suggestions for the given sentence by checking all rules. - pub fn apply(&self, sentence: &Sentence) -> Vec { - let sentence = MatchSentence::new(sentence); - - let mut output: Vec<(usize, Suggestion)> = self - .rules - .maybe_par_iter() - .enumerate() - .filter(|(_, rule)| rule.enabled()) - .map(|(i, rule)| { - let mut output = Vec::new(); - - for suggestion in rule.apply(&sentence) { - output.push((i, suggestion)); - } - - output - }) - .flatten() - .collect(); - - output.sort_by(|(ia, a), (ib, b)| { - a.span() - .char() - .start - .cmp(&b.span().char().start) - .then_with(|| ib.cmp(ia)) - }); - - let mut mask = vec![false; sentence.text().chars().count()]; - - output - .into_iter() - .filter_map(|(_, suggestion)| { - let span = suggestion.span().clone().lshift(sentence.span().start()); - - if mask[span.char().clone()].iter().all(|x| !x) { - mask[span.char().clone()].iter_mut().for_each(|x| *x = true); - Some(suggestion) - } else { - None - } - }) - .collect() - } - - /// Compute the suggestions for a text by checking all rules. - pub fn suggest(&self, text: &str, tokenizer: &Tokenizer) -> Vec { - if text.is_empty() { - return Vec::new(); - } - - let mut suggestions = Vec::new(); - - // get suggestions sentence by sentence - for sentence in tokenizer.pipe(text) { - suggestions.extend(self.apply(&sentence)); - } - - suggestions - } - - /// Correct a text by first tokenizing, then finding all suggestions and choosing the first replacement of each suggestion. - pub fn correct(&self, text: &str, tokenizer: &Tokenizer) -> String { - let suggestions = self.suggest(text, tokenizer); - apply_suggestions(text, &suggestions) - } -} - -/// Correct a text by applying suggestions to it. -/// In the case of multiple possible replacements, always chooses the first one. -pub fn apply_suggestions(text: &str, suggestions: &[Suggestion]) -> String { - let mut offset: isize = 0; - let mut chars: Vec<_> = text.chars().collect(); - - for suggestion in suggestions { - let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); - chars.splice( - (suggestion.span().char().start as isize + offset) as usize - ..(suggestion.span().char().end as isize + offset) as usize, - replacement.iter().cloned(), - ); - offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize; - } - - chars.into_iter().collect() -} - -/// An iterator over references to rules. -pub struct RulesIter<'a> { - selector: Option<&'a Selector>, - inner: std::slice::Iter<'a, Rule>, -} - -impl<'a> Iterator for RulesIter<'a> { - type Item = &'a Rule; - fn next(&mut self) -> Option { - let selector = self.selector.as_ref(); - - self.inner - .find(|rule| selector.map_or(true, |s| s.is_match(rule.id()))) - } -} - -/// An iterator over mutable references to rules. -pub struct RulesIterMut<'a> { - selector: Option<&'a Selector>, - inner: std::slice::IterMut<'a, Rule>, -} - -impl<'a> Iterator for RulesIterMut<'a> { - type Item = &'a mut Rule; - fn next(&mut self) -> Option { - let selector = self.selector.as_ref(); - - self.inner - .find(|rule| selector.map_or(true, |s| s.is_match(rule.id()))) - } -} - -impl IntoIterator for Rules { - type Item = Rule; - type IntoIter = std::vec::IntoIter; - fn into_iter(self) -> Self::IntoIter { - self.rules.into_iter() - } -} - -impl FromIterator for Rules -where - R: Into, -{ - fn from_iter>(iter: I) -> Self { - let rules: Vec = iter.into_iter().map(|x| x.into()).collect(); - Self { rules } - } -} diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/tokenizer.rs deleted file mode 100644 index 6076195..0000000 --- a/nlprule/src/tokenizer.rs +++ /dev/null @@ -1,396 +0,0 @@ -//! A tokenizer to split raw text into tokens. -//! Tokens are assigned lemmas and part-of-speech tags by lookup from a [Tagger][tag::Tagger] and chunks containing -//! information about noun / verb and grammatical case by a statistical [Chunker][chunk::Chunker]. -//! Tokens are *disambiguated* (i. e. information from the initial assignment is changed) in a rule-based way by -//! [DisambiguationRule][crate::rule::DisambiguationRule]s. - -use crate::{ - rule::id::{Index, Selector}, - rule::MatchSentence, - types::*, - utils::{parallelism::MaybeParallelRefIterator, regex::Regex}, - Error, -}; -use fs_err::File; -use serde::{Deserialize, Serialize}; -use std::{ - io::{BufReader, Read, Write}, - ops::Range, - path::Path, - sync::Arc, -}; - -pub mod chunk; -pub mod multiword; -pub mod tag; - -use chunk::Chunker; -use multiword::MultiwordTagger; -use tag::Tagger; - -use crate::rule::DisambiguationRule; - -/// Split a text at the points where the given function is true. -/// Keeps the separators. See https://stackoverflow.com/a/40296745. -fn split(text: &str, split_func: F) -> Vec<&str> -where - F: Fn(char) -> bool, -{ - let mut result = Vec::new(); - let mut last = 0; - for (index, matched) in text.match_indices(split_func) { - if last != index { - result.push(&text[last..index]); - } - result.push(matched); - last = index + matched.len(); - } - if last < text.len() { - result.push(&text[last..]); - } - - result -} - -/// Options for a tokenizer. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub(crate) struct TokenizerLangOptions { - /// Whether to allow errors while constructing the tokenizer. - pub allow_errors: bool, - /// Disambiguation Rule selectors to use in this tokenizer. - #[serde(default)] - pub ids: Vec, - /// Disambiguation Rule selectors to ignore in this tokenizer. - #[serde(default)] - pub ignore_ids: Vec, - /// Specific examples in the notation `{id}:{example_index}` which are known to fail. - #[serde(default)] - pub known_failures: Vec, - /// Extra language-specific characters to split text on. - #[serde(default)] - pub extra_split_chars: Vec, - /// Extra language-specific Regexes of which the matches will *not* be split into multiple tokens. - #[serde(default)] - pub extra_join_regexes: Vec, -} - -impl Default for TokenizerLangOptions { - fn default() -> Self { - TokenizerLangOptions { - allow_errors: false, - ids: Vec::new(), - ignore_ids: Vec::new(), - known_failures: Vec::new(), - extra_split_chars: Vec::new(), - extra_join_regexes: Vec::new(), - } - } -} - -/// An iterator over [IncompleteSentence]s. Has the same properties as [SentenceIter]. -pub struct IncompleteSentenceIter<'t> { - text: &'t str, - splits: Vec>, - tokenizer: &'t Tokenizer, - index: usize, - position: Position, -} - -impl<'t> Iterator for IncompleteSentenceIter<'t> { - type Item = Sentence<'t>; - - fn next(&mut self) -> Option { - if self.index == self.splits.len() { - return None; - } - - let mut range = self.splits[self.index].clone(); - self.index += 1; - - // as long as the current sentence contains only whitespace, add the next sentence - // in practice, this might never happen, but we can not make any assumption about - // SRX rule behavior here. - while self.text[range.clone()].trim().is_empty() && self.index < self.splits.len() { - range.end = self.splits[self.index].end; - self.index += 1; - } - - let sentence = self - .tokenizer - .tokenize(&self.text[range.clone()]) - .map(|x| x.rshift(self.position)); - - self.position += Position { - char: self.text[range.clone()].chars().count(), - byte: range.len(), - }; - - sentence - } -} - -/// An iterator over [Sentence]s. Has some key properties: -/// - Preceding whitespace is always included so the first sentence always starts at byte and char index zero. -/// - There are no gaps between sentences i.e. `sentence[i - 1].span().end() == sentence[i].span().start()`. -/// - Behavior for trailing whitespace is not defined. Can be included in the last sentence or not be part of any sentence. -pub struct SentenceIter<'t> { - inner: IncompleteSentenceIter<'t>, - tokenizer: &'t Tokenizer, -} - -impl<'t> Iterator for SentenceIter<'t> { - type Item = Sentence<'t>; - - fn next(&mut self) -> Option { - self.inner - .next() - .map(|sentence| self.tokenizer.disambiguate(sentence)) - } -} - -/// The complete Tokenizer doing tagging, chunking and disambiguation. -#[derive(Serialize, Deserialize, Default, Clone)] -pub struct Tokenizer { - pub(crate) rules: Vec, - pub(crate) chunker: Option, - pub(crate) sentencizer: srx::Rules, - pub(crate) multiword_tagger: Option, - pub(crate) tagger: Arc, - pub(crate) lang_options: TokenizerLangOptions, -} - -impl Tokenizer { - /// Creates a new tokenizer from a path to a binary. - /// - /// # Errors - /// - If the file can not be opened. - /// - If the file content can not be deserialized to a rules set. - pub fn new>(p: P) -> Result { - let reader = BufReader::new(File::open(p.as_ref())?); - Ok(bincode::deserialize_from(reader)?) - } - - /// Creates a new tokenizer from a reader. - pub fn from_reader(reader: R) -> Result { - Ok(bincode::deserialize_from(reader)?) - } - - /// Serializes this rules set to a writer. - pub fn to_writer(&self, writer: W) -> Result<(), Error> { - Ok(bincode::serialize_into(writer, &self)?) - } - - /// Gets all disambigation rules in the order they are applied. - pub fn rules(&self) -> &[DisambiguationRule] { - &self.rules - } - - /// Gets the lexical tagger. - pub fn tagger(&self) -> &Arc { - &self.tagger - } - - /// Gets the chunker if one exists. - pub fn chunker(&self) -> &Option { - &self.chunker - } - - pub(crate) fn lang_options(&self) -> &TokenizerLangOptions { - &self.lang_options - } - - pub(crate) fn disambiguate_up_to_id<'t>( - &'t self, - mut sentence: Sentence<'t>, - id: Option<&Index>, - ) -> Sentence<'t> { - let n = id.map_or(self.rules.len(), |id| { - self.rules.iter().position(|x| x.id == *id).unwrap() - }); - let mut i = 0; - - while i < n { - let match_sentence = MatchSentence::new(&sentence); - - let result = self.rules[i..n] - .maybe_par_iter() - .enumerate() - .filter_map(|(j, rule)| { - let changes = rule.apply(&match_sentence); - if changes.is_empty() { - None - } else { - Some((j + i, changes)) - } - }) - .find_first(|_| true); - - if let Some((index, changes)) = result { - self.rules[index].change(&mut sentence, changes); - i = index + 1; - } else { - i = n; - } - } - - sentence - } - - /// Apply rule-based disambiguation to the tokens. - /// This does not change the number of tokens, but can change the content arbitrarily. - pub fn disambiguate<'t>(&'t self, sentence: Sentence<'t>) -> Sentence<'t> { - self.disambiguate_up_to_id(sentence, None) - } - - fn get_token_ranges<'t>( - &self, - text: &'t str, - ) -> impl ExactSizeIterator> + 't + Clone { - let mut tokens = Vec::new(); - - let split_char = |c: char| c.is_whitespace() || crate::utils::splitting_chars().contains(c); - let split_text = |text: &'t str| { - let mut tokens = Vec::new(); - for pretoken in split(text, split_char) { - // if the token is in the dictionary, we add it right away - if self.tagger.id_word(pretoken.into()).1.is_some() { - tokens.push(pretoken); - } else { - // otherwise, potentially split it again with `extra_split_chars` e. g. "-" - tokens.extend(split(pretoken, |c| { - split_char(c) || self.lang_options.extra_split_chars.contains(&c) - })); - } - } - tokens - }; - - let mut joined_mask = vec![false; text.len()]; - let mut joins = Vec::new(); - - for regex in self.lang_options.extra_join_regexes.iter() { - for mat in regex.find_iter(text) { - if !joined_mask[mat.start()..mat.end()].iter().any(|x| *x) { - joins.push(mat.start()..mat.end()); - joined_mask[mat.start()..mat.end()] - .iter_mut() - .for_each(|x| *x = true); - } - } - } - - joins.sort_by(|a, b| a.start.cmp(&b.start)); - - let mut prev = 0; - for range in joins { - tokens.extend(split_text(&text[prev..range.start])); - prev = range.end; - tokens.push(&text[range]); - } - - tokens.extend(split_text(&text[prev..text.len()])); - tokens.into_iter().map(move |token| { - let byte_start = (token.as_ptr() as usize) - .checked_sub(text.as_ptr() as usize) - .expect("Each token str is a slice of the text str."); - - byte_start..byte_start + token.len() - }) - } - - /// Tokenize the given sentence. This applies chunking and tagging, but does not do disambiguation. - // NB: this is not public because it could be easily misused by passing a text instead of one sentence. - pub(crate) fn tokenize<'t>(&'t self, sentence: &'t str) -> Option> { - if sentence.trim().is_empty() { - return None; - } - - let token_strs = self - .get_token_ranges(sentence) - .filter(|range| !sentence[range.clone()].trim().is_empty()); - - let n_token_strs = token_strs.clone().count(); - - let tokens: Vec<_> = token_strs - .enumerate() - .map(|(i, range)| { - let byte_start = range.start; - let char_start = sentence[..byte_start].chars().count(); - - let token_text = sentence[range].trim(); - - let is_sentence_start = i == 0; - let is_sentence_end = i == n_token_strs - 1; - - let id = self.tagger.id_word(token_text.into()); - - let mut tag_vec: Vec<_> = self - .tagger - .get_tags_with_options( - token_text, - if is_sentence_start { Some(true) } else { None }, - None, - ) - .collect(); - - tag_vec.push( - WordData::new( - self.tagger().id_word(token_text.into()), - PosId::special(SpecialPos::None), - ) - .freeze(), - ); - - if is_sentence_end { - tag_vec.push( - WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)) - .freeze(), - ); - } - - Token::new( - id, - Tags::new(tag_vec), - Span::new( - byte_start..byte_start + token_text.len(), - char_start..char_start + token_text.chars().count(), - ), - is_sentence_end, - sentence[..byte_start].ends_with(char::is_whitespace), - Vec::new(), - ) - }) - .collect(); - - let mut sentence = Sentence::new(tokens, sentence, &self.tagger); - - if let Some(chunker) = &self.chunker { - chunker.apply(&mut sentence); - } - - if let Some(multiword_tagger) = &self.multiword_tagger { - multiword_tagger.apply(&mut sentence); - } - - Some(sentence) - } - - /// Splits the text into sentences and tokenizes each sentence. - pub fn sentencize<'t>(&'t self, text: &'t str) -> IncompleteSentenceIter<'t> { - IncompleteSentenceIter { - text, - splits: self.sentencizer.split_ranges(text), - tokenizer: &self, - index: 0, - position: Position::default(), - } - } - - /// Applies the entire tokenization pipeline including sentencization, tagging, chunking and disambiguation. - pub fn pipe<'t>(&'t self, text: &'t str) -> SentenceIter<'t> { - SentenceIter { - inner: self.sentencize(text), - tokenizer: &self, - } - } -} diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index d89809e..7f66716 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -1,8 +1,7 @@ //! Fundamental types used by this crate. -use crate::tokenizer::tag::Tagger; -pub use crate::tokenizer::tag::{PosId, WordId}; -pub(crate) use crate::tokenizer::tag::{PosIdInt, SpecialPos, WordIdInt}; +pub(crate) use crate::components::tagger::{PosId, SpecialPos, WordId, WordIdInt}; +use crate::{components::tagger::Tagger, properties::Property}; use derivative::Derivative; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; @@ -69,6 +68,12 @@ impl<'t> Sentence<'t> { &self.tokens } + /// Gets the first token in this sentence. There is always at least one token in the sentence + /// so this will never panic. + pub fn first(&self) -> &Token<'t> { + &self.tokens[0] + } + /// Gets the amount of tokens in this sentence. pub fn len(&self) -> usize { self.tokens.len() @@ -177,15 +182,20 @@ impl<'a, 't> Iterator for TagIter<'a, 't> { /// Contains all the local information about a token i. e. /// the text itself and the [WordData]s associated with the word. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)] pub struct Tags<'t> { + id: WordId<'t>, tags: Vec>, } impl<'t> Tags<'t> { /// Creates new [Tags]. - pub fn new(tags: Vec>) -> Self { - Tags { tags } + pub fn new(id: WordId<'t>, tags: Vec>) -> Self { + Tags { id, tags } + } + + pub fn id(&self) -> &WordId<'t> { + &self.id } /// Multiple pairs of (lemma, part-of-speech) associated with this token. @@ -221,59 +231,64 @@ impl<'t> Tags<'t> { /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data. pub fn into_static(self) -> Tags<'static> { Tags { + id: self.id.into_static(), tags: self.tags.into_iter().map(|x| x.into_static()).collect(), } } } +lazy_static! { + pub(crate) static ref SENT_START: Token<'static> = Token { + text: "", + span: Span::default(), + is_sentence_start: false, // `is_sentence_start` marks the first *real* token in the sentence. + is_sentence_end: false, + has_space_before: false, + tags: Some(Tags::new( + WordId::empty(), + vec![WordData::new( + WordId::empty(), + PosId::special(SpecialPos::SentStart), + )], + )), + chunks: Some(Vec::new()), + }; +} + /// A token where varying levels of information are set. #[derive(Debug, Clone, PartialEq)] pub struct Token<'t> { - text: WordId<'t>, - tags: Tags<'t>, + text: &'t str, span: Span, + is_sentence_start: bool, is_sentence_end: bool, has_space_before: bool, - chunks: Vec, + pub tags: Option>, + pub chunks: Option>, } impl<'t> Token<'t> { pub(crate) fn new( - text: WordId<'t>, - tags: Tags<'t>, + text: &'t str, span: Span, + is_sentence_start: bool, is_sentence_end: bool, has_space_before: bool, - chunks: Vec, ) -> Self { Token { text, - tags, span, + is_sentence_start, is_sentence_end, has_space_before, - chunks, + tags: None, + chunks: None, } } - /// Gets the word id for this token. - pub fn text(&self) -> &WordId<'t> { - &self.text - } - /// Gets the token as string. - pub fn as_str(&self) -> &str { - self.text.as_str() - } - - /// The tags of this token. Contain information about the part-of-speech tags and lemmas. - pub fn tags(&self) -> &Tags<'t> { - &self.tags - } - - #[allow(missing_docs)] - pub fn tags_mut(&mut self) -> &mut Tags<'t> { - &mut self.tags + pub fn as_str(&self) -> &'t str { + self.text } /// The span of this sentence. @@ -281,7 +296,12 @@ impl<'t> Token<'t> { &self.span } - /// Whether this token is the last token in the sentence- + /// Whether this token is the first token in the sentence. + pub fn is_sentence_start(&self) -> bool { + self.is_sentence_start + } + + /// Whether this token is the last token in the sentence. pub fn is_sentence_end(&self) -> bool { self.is_sentence_end } @@ -291,32 +311,37 @@ impl<'t> Token<'t> { self.has_space_before } - /// Chunks associated with this token. - pub fn chunks(&self) -> &[String] { - &self.chunks - } - - #[allow(missing_docs)] - pub fn chunks_mut(&mut self) -> &mut Vec { - &mut self.chunks - } - /// Shift the span of this token right by the specified amount. pub fn rshift(mut self, position: Position) -> Self { self.span = self.span.rshift(position); self } +} - /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data. - pub fn into_static(self) -> Token<'static> { - Token { - text: self.text.into_static(), - tags: self.tags.into_static(), - span: self.span, - is_sentence_end: self.is_sentence_end, - has_space_before: self.has_space_before, - chunks: self.chunks, - } +impl<'t> Token<'t> { + /// The tags of this token. Contain information about the part-of-speech tags and lemmas. + pub fn tags(&self) -> Result<&Tags<'t>, crate::Error> { + self.tags + .as_ref() + .ok_or_else(|| crate::properties::Error::Unset(Property::Tags).into()) + } + + pub fn tags_mut(&mut self) -> Result<&mut Tags<'t>, crate::Error> { + self.tags + .as_mut() + .ok_or_else(|| crate::properties::Error::Unset(Property::Tags).into()) + } + + pub fn chunks(&self) -> Result<&[String], crate::Error> { + self.chunks + .as_deref() + .ok_or_else(|| crate::properties::Error::Unset(Property::Chunks).into()) + } + + pub fn chunks_mut(&mut self) -> Result<&mut Vec, crate::Error> { + self.chunks + .as_mut() + .ok_or_else(|| crate::properties::Error::Unset(Property::Chunks).into()) } } diff --git a/nlprule/tests/tests.rs b/nlprule/tests/tests.rs index 7d08956..6408c96 100644 --- a/nlprule/tests/tests.rs +++ b/nlprule/tests/tests.rs @@ -1,20 +1,17 @@ use std::convert::TryInto; use lazy_static::lazy_static; -use nlprule::{rule::id::Category, types::Position, Rules, Tokenizer}; +use nlprule::{lang::en, properties::*, rule::id::Category, types::Position}; use quickcheck_macros::quickcheck; -const TOKENIZER_PATH: &str = "../storage/en_tokenizer.bin"; -const RULES_PATH: &str = "../storage/en_rules.bin"; - lazy_static! { - static ref TOKENIZER: Tokenizer = Tokenizer::new(TOKENIZER_PATH).unwrap(); - static ref RULES: Rules = Rules::new(RULES_PATH).unwrap(); + static ref ANALYZER: en::Analyzer = en::analyzer(); + static ref CORRECTER: en::Correcter = en::correcter(); } #[test] -fn can_tokenize_empty_text() { - let sentences: Vec<_> = TOKENIZER.pipe("").collect(); +fn can_analyze_empty_text() { + let sentences: Vec<_> = ANALYZER.tokenize("").collect(); assert!(sentences.is_empty()); } @@ -23,7 +20,7 @@ fn handles_whitespace_correctly() { // preceding whitespace has to be included, trailing whitespace behavior is unspecified let text = " hello.\ttest.\t\t"; - let mut sentences = TOKENIZER.pipe(text); + let mut sentences = ANALYZER.tokenize(text); assert_eq!( &text[sentences.next().unwrap().span().byte().clone()], " hello.\t" @@ -32,21 +29,21 @@ fn handles_whitespace_correctly() { &text[sentences.next().unwrap().span().byte().clone()], "test.\t" ); - assert_eq!(sentences.next(), None); + assert!(sentences.next().is_none()); } #[quickcheck] -fn can_tokenize_anything(text: String) -> bool { - let _: Vec<_> = TOKENIZER.pipe(&text).collect(); +fn can_analyze_anything(text: String) -> bool { + let _: Vec<_> = ANALYZER.tokenize(&text).collect(); true } #[test] fn suggest_indices_are_relative_to_input_text() { - let suggestions = RULES.suggest( - "I can due his homework for 10€. I can due his homework.", - &*TOKENIZER, - ); + let suggestions: Vec<_> = CORRECTER + .suggest("I can due his homework for 10€. I can due his homework.") + .flatten() + .collect(); assert_eq!(*suggestions[0].span().char(), 6..9); assert_eq!(*suggestions[0].span().byte(), 6..9); @@ -62,7 +59,7 @@ fn suggest_indices_are_relative_to_input_text() { fn sentence_spans_correct() { let text = "A short test. A test with emoji 😊."; - let sentences: Vec<_> = TOKENIZER.pipe(text).collect(); + let sentences: Vec<_> = ANALYZER.tokenize(text).collect(); assert_eq!(sentences.len(), 2); assert_eq!(*sentences[0].span().char(), 0..14); @@ -76,8 +73,8 @@ fn sentence_spans_correct() { fn token_spans_correct() { let text = "A short test. A test with emoji 😊."; - let tokens: Vec<_> = TOKENIZER - .pipe(text) + let tokens: Vec<_> = ANALYZER + .tokenize(text) .map(|x| x.into_iter()) .flatten() .collect(); @@ -99,7 +96,7 @@ fn no_gaps_between_sentences(text: String) { let mut prev_pos = Position::default(); let mut contains_sentence = false; - for sentence in TOKENIZER.pipe(&text) { + for sentence in ANALYZER.tokenize(&text) { assert_eq!(sentence.span().start(), prev_pos); prev_pos += sentence.span().len(); @@ -111,14 +108,18 @@ fn no_gaps_between_sentences(text: String) { #[test] fn rules_can_be_disabled_enabled() { - let mut rules = Rules::new(RULES_PATH).unwrap(); + let mut correcter = CORRECTER.clone(); // enabled by default - assert!(!rules - .suggest("I can due his homework", &*TOKENIZER) - .is_empty()); + assert!(correcter + .suggest("I can due his homework") + .flatten() + .next() + .is_some()); - rules + correcter + .components_mut() + .1 .select_mut( &Category::new("confused_words") .join("confusion_due_do") @@ -127,17 +128,28 @@ fn rules_can_be_disabled_enabled() { .for_each(|x| x.disable()); // disabled now - assert!(rules - .suggest("I can due his homework", &*TOKENIZER) - .is_empty()); + assert!(correcter + .suggest("I can due his homework") + .flatten() + .next() + .is_none()); // disabled by default - assert!(rules.suggest("I can not go", &*TOKENIZER).is_empty()); + assert!(correcter.suggest("I can not go").flatten().next().is_none()); - rules + correcter + .components_mut() + .1 .select_mut(&"typos/can_not".try_into().unwrap()) .for_each(|x| x.enable()); // enabled now - assert!(!rules.suggest("I can not go", &*TOKENIZER).is_empty()); + assert!(correcter.suggest("I can not go").flatten().next().is_some()); +} + +#[test] +fn pipelines_work_with_references() -> Result<(), crate::Error> { + let _pipeline = Pipeline::new((&*ANALYZER, &CORRECTER.components().1))?; + + Ok(()) } diff --git a/python/src/lib.rs b/python/src/lib.rs index 49d1e28..ccb310e 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -182,6 +182,7 @@ impl PyToken { fn data(&self) -> Vec<(&str, &str)> { self.token .tags() + .unwrap() .iter() .map(|x| (x.lemma().as_str(), x.pos().as_str())) .collect() @@ -192,6 +193,7 @@ impl PyToken { let mut lemmas: Vec<_> = self .token .tags() + .unwrap() .iter() .filter_map(|x| { if x.lemma().as_str().is_empty() { @@ -211,6 +213,7 @@ impl PyToken { let mut tags: Vec<_> = self .token .tags() + .unwrap() .iter() .filter_map(|x| { if x.pos().as_str().is_empty() { @@ -227,7 +230,12 @@ impl PyToken { #[getter] fn chunks(&self) -> Vec<&str> { - self.token.chunks().iter().map(|x| x.as_str()).collect() + self.token + .chunks() + .unwrap() + .iter() + .map(|x| x.as_str()) + .collect() } } @@ -355,6 +363,7 @@ impl PyTokenizer { .pipe(&text) .map(|sentence| { sentence + .unwrap() .into_iter() .map(|token| PyCell::new(py, PyToken::from(token.into_static()))) .collect::>>() @@ -619,6 +628,7 @@ impl PyRules { self.rules .read() .suggest(&sentence, &tokenizer) + .unwrap() .into_iter() .map(|x| PyCell::new(py, PySuggestion::from(x))) .collect::>>() @@ -639,7 +649,7 @@ impl PyRules { let tokenizer = self.tokenizer.borrow(py); let tokenizer = tokenizer.tokenizer(); - Ok(self.rules.read().correct(&text, tokenizer)) + Ok(self.rules.read().correct(&text, tokenizer).unwrap()) }) } diff --git a/scripts/build_and_test.sh b/scripts/build_and_test.sh index bb18ed8..5b28e3b 100755 --- a/scripts/build_and_test.sh +++ b/scripts/build_and_test.sh @@ -4,26 +4,35 @@ then exit fi -# this script assumes the build directories are in data/ -# only for convenience -mkdir -p storage +set -e -# x-- => only compile -# -xx => test_disambiguation and test -# xxx or flags not set => everything -flags=${2:-"xxx"} +mkdir -p nlprule/src/storage -if [ "${flags:0:1}" == "x" ] -then - RUST_LOG=INFO cargo run --all-features --bin compile -- --build-dir data/$1 --tokenizer-out storage/$1_tokenizer.bin --rules-out storage/$1_rules.bin +cd data + +# download + extract the build directory from backblaze if we don't have it yet +if [ ! -f $1.zip ]; then + wget https://f000.backblazeb2.com/file/nlprule/$1.zip + unzip -o $1.zip fi -if [ "${flags:1:1}" == "x" ] +cd .. + +# x- => only compile +# -x => only test +# xx or flags not set => everything +flags=${2:-"xx"} + +if [ "${flags:0:1}" == "x" ] then - RUST_LOG=WARN cargo run --all-features --bin test_disambiguation -- --tokenizer storage/$1_tokenizer.bin + cd nlprule + RUST_LOG=INFO cargo run --features "compile bin" --bin compile -- --build-dir ../data/$1 --out-dir storage/$1 + cd .. fi -if [ "${flags:2:1}" == "x" ] +if [ "${flags:1:1}" == "x" ] then - RUST_LOG=WARN cargo run --all-features --bin test -- --tokenizer storage/$1_tokenizer.bin --rules storage/$1_rules.bin + cd nlprule + RUST_LOG=INFO cargo run --no-default-features --features "bin binaries-$1 regex-all-test" --bin test_$1 + cd .. fi \ No newline at end of file