diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7800425..e3ac562 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-18.04
     strategy:
       matrix:
-        lang: ["en", "de", "es"] # TODO: load this from build/languages.txt
+        lang: ["en", "de", "es"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions-rs/toolchain@v1
@@ -22,18 +22,9 @@ jobs:
           toolchain: stable
           target: wasm32-unknown-unknown
       - uses: Swatinem/rust-cache@v1
-      - run: |
-          mkdir data
-          mkdir storage
-
-          cd data
-          wget https://f000.backblazeb2.com/file/nlprule/${{ matrix.lang }}.zip
-          unzip ${{ matrix.lang }}.zip
-      - name: Build source
-        uses: actions-rs/cargo@v1
-        with:
-          command: build
-          args: --all-features
+      - name: Build and test language  # NOTE(review): meaning of the trailing "xx" arg is not shown here; confirm in the script
+        run: |
+          bash scripts/build_and_test.sh ${{ matrix.lang }} xx
       - name: Build source (WebAssembly)
         uses: actions-rs/cargo@v1
         with:
@@ -44,55 +35,18 @@ jobs:
         if: matrix.lang == 'en'
         with:
           token: ${{ secrets.GITHUB_TOKEN }}
-          args: --all-features
-      - name: Build binaries
-        uses: actions-rs/cargo@v1
-        env:
-          RUST_LOG: INFO
-        with:
-          command: run
-          args: --all-features --bin compile -- --build-dir data/${{ matrix.lang }} --tokenizer-out storage/${{ matrix.lang }}_tokenizer.bin --rules-out storage/${{ matrix.lang }}_rules.bin
+          args: --features "binaries-en compile bin regex-all-test"
       - name: Run nlprule tests
         uses: actions-rs/cargo@v1
         if: matrix.lang == 'en'
         with:
           command: test
-          args: --verbose --all-features --release
-      - name: Run disambiguation tests
-        uses: actions-rs/cargo@v1
-        env:
-            RUST_LOG: WARN
-        with:
-          command: run
-          args: --all-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin
-      - name: Run disambiguation tests (with regex-fancy backend)
-        uses: actions-rs/cargo@v1
-        if: matrix.lang == 'en'
-        env:
-            RUST_LOG: WARN
-        with:
-          command: run
-          args: --manifest-path nlprule/Cargo.toml --features "bin regex-onig" --no-default-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin
-      - name: Run disambiguation tests (with regex-onig backend)
-        uses: actions-rs/cargo@v1
-        if: matrix.lang == 'en'
-        env:
-            RUST_LOG: WARN
-        with:
-          command: run
-          args: --manifest-path nlprule/Cargo.toml --features "bin regex-fancy" --no-default-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin
-      - name: Run grammar rule tests
-        uses: actions-rs/cargo@v1
-        env:
-            RUST_LOG: WARN
-        with:
-          command: run
-          args: --all-features --bin test -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin --rules storage/${{ matrix.lang }}_rules.bin
+          args: --verbose --features "binaries-en" --release
       - name: Upload binaries as artifact
         uses: actions/upload-artifact@v2
         with:
           name: binaries
-          path: storage/*
+          path: nlprule/storage/*
 
   matrix_prep:
     runs-on: ubuntu-latest
@@ -107,145 +61,145 @@ jobs:
           # inputFile: '.github/workflows/matrix_includes.json' # Default input file path
           filter: '[?runOnEvent==`${{ github.event_name }}` || runOnEvent==`always`]'
 
-  python:
-    needs: [matrix_prep, rust]
-    strategy:      
-      matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}}
-    runs-on: ${{ matrix.os }}
-    container: ${{ matrix.container }}
-    env:
-      working-directory: python
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions-rs/toolchain@v1 # maturin needs Rust (obviously)
-        with:
-          profile: minimal
-          toolchain: stable
-      - uses: Swatinem/rust-cache@v1
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
-        with:
-          python-version: ${{ matrix.python-version }}
-      - uses: actions/download-artifact@v2
-        with:
-          name: binaries
-          path: storage
-      - name: Install GSED (if needed) # needed by set_version.sh
-        if: matrix.os == 'macos-latest'
-        run: |
-          brew install gnu-sed
-      - name: Update version (if release)
-        if: github.event_name == 'release'
-        run: |
-          bash scripts/set_version.sh ${{ github.event.release.tag_name }}
-      - name: Build and Test
-        run: |
-          # pybin is the directory with python binaries
-          PYBIN=${{ matrix.pybin }}
+  # python:
+  #   needs: [matrix_prep, rust]
+  #   strategy:      
+  #     matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}}
+  #   runs-on: ${{ matrix.os }}
+  #   container: ${{ matrix.container }}
+  #   env:
+  #     working-directory: python
+  #   steps:
+  #     - uses: actions/checkout@v2
+  #     - uses: actions-rs/toolchain@v1 # maturin needs Rust (obviously)
+  #       with:
+  #         profile: minimal
+  #         toolchain: stable
+  #     - uses: Swatinem/rust-cache@v1
+  #     - name: Set up Python ${{ matrix.python-version }}
+  #       uses: actions/setup-python@v1
+  #       with:
+  #         python-version: ${{ matrix.python-version }}
+  #     - uses: actions/download-artifact@v2
+  #       with:
+  #         name: binaries
+  #         path: storage
+  #     - name: Install GSED (if needed) # needed by set_version.sh
+  #       if: matrix.os == 'macos-latest'
+  #       run: |
+  #         brew install gnu-sed
+  #     - name: Update version (if release)
+  #       if: github.event_name == 'release'
+  #       run: |
+  #         bash scripts/set_version.sh ${{ github.event.release.tag_name }}
+  #     - name: Build and Test
+  #       run: |
+  #         # pybin is the directory with python binaries
+  #         PYBIN=${{ matrix.pybin }}
                     
-          if [ -z "${PYBIN}" ]; then
-              PIP_CMD="python -m pip"
-              PYTHON_CMD="python"
-              PYTEST_CMD="python -m pytest"
-              export MATURIN_CMD="maturin"
-          else 
-              PIP_CMD="${PYBIN}/pip"
-              PYTHON_CMD="${PYBIN}/python"
-              PYTEST_CMD="${PYBIN}/pytest"
-              export MATURIN_CMD="${PYBIN}/maturin"
-          fi
+  #         if [ -z "${PYBIN}" ]; then
+  #             PIP_CMD="python -m pip"
+  #             PYTHON_CMD="python"
+  #             PYTEST_CMD="python -m pytest"
+  #             export MATURIN_CMD="maturin"
+  #         else 
+  #             PIP_CMD="${PYBIN}/pip"
+  #             PYTHON_CMD="${PYBIN}/python"
+  #             PYTEST_CMD="${PYBIN}/pytest"
+  #             export MATURIN_CMD="${PYBIN}/maturin"
+  #         fi
 
-          # if pybin is set, the venv will not be used
-          # still create it here for convenience since we need it on windows
-          ${PYTHON_CMD} -m venv venv
-          . venv/bin/activate || . venv/Scripts/activate # 'Scripts' on windows, 'bin' on Linux / macOS
-          ${PIP_CMD} install --upgrade pip
-          ${PIP_CMD} install maturin==0.9.4 pytest==6.1.2
+  #         # if pybin is set, the venv will not be used
+  #         # still create it here for convenience since we need it on windows
+  #         ${PYTHON_CMD} -m venv venv
+  #         . venv/bin/activate || . venv/Scripts/activate # 'Scripts' on windows, 'bin' on Linux / macOS
+  #         ${PIP_CMD} install --upgrade pip
+  #         ${PIP_CMD} install maturin==0.9.4 pytest==6.1.2
           
-          # remove potentially cached wheels
-          rm target/wheels/* || true
-          bash scripts/maturin.sh build --interpreter ${PYTHON_CMD} --release --manylinux 2014
+  #         # remove potentially cached wheels
+  #         rm target/wheels/* || true
+  #         bash scripts/maturin.sh build --interpreter ${PYTHON_CMD} --release --manylinux 2014
 
-          # install the wheel in two different ways:
-          # 1. via pip: needed on manylinux
-          # 2. via maturin develop: needed on windows in venv
-          ${PIP_CMD} install $(ls target/wheels/* | head -n1)
-          bash scripts/maturin.sh develop --release
+  #         # install the wheel in two different ways:
+  #         # 1. via pip: needed on manylinux
+  #         # 2. via maturin develop: needed on windows in venv
+  #         ${PIP_CMD} install $(ls target/wheels/* | head -n1)
+  #         bash scripts/maturin.sh develop --release
 
-          ${PYTEST_CMD} python/test.py -s
-        shell: bash
-      - name: Upload wheel as artifact
-        uses: actions/upload-artifact@v2
-        with:
-          name: python-wheel
-          path: target/wheels/*
+  #         ${PYTEST_CMD} python/test.py -s
+  #       shell: bash
+  #     - name: Upload wheel as artifact
+  #       uses: actions/upload-artifact@v2
+  #       with:
+  #         name: python-wheel
+  #         path: target/wheels/*
 
-  publish:
-    runs-on: ubuntu-latest
-    needs: [rust, python]
-    if: github.event_name == 'release'
+  # publish:
+  #   runs-on: ubuntu-latest
+  #   needs: [rust, python]
+  #   if: github.event_name == 'release'
 
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          ref: ${{ github.head_ref }}
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v1
-        with:
-          python-version: 3.8
-      - uses: actions/download-artifact@v2
-        with:
-          name: python-wheel
-          path: python-wheel
-      - uses: actions/download-artifact@v2
-        with:
-          name: binaries
-          path: storage
-      - run: |
-          gzip storage/en_tokenizer.bin
-          gzip storage/en_rules.bin
-          gzip storage/de_tokenizer.bin
-          gzip storage/de_rules.bin
-          gzip storage/es_tokenizer.bin
-          gzip storage/es_rules.bin
-      - name: Update version
-        run: |
-          bash scripts/set_version.sh ${{ github.event.release.tag_name }}
-      - name: Publish on crates.io
-        run: | # --allow-dirty is only needed b/c of the README.md, we can be sure it is clean otherwise anyway because it is freshly checked out
-          cargo login $CARGO_KEY
+  #   steps:
+  #     - uses: actions/checkout@v2
+  #       with:
+  #         ref: ${{ github.head_ref }}
+  #     - name: Set up Python 3.8
+  #       uses: actions/setup-python@v1
+  #       with:
+  #         python-version: 3.8
+  #     - uses: actions/download-artifact@v2
+  #       with:
+  #         name: python-wheel
+  #         path: python-wheel
+  #     - uses: actions/download-artifact@v2
+  #       with:
+  #         name: binaries
+  #         path: storage
+  #     - run: |
+  #         gzip storage/en_tokenizer.bin
+  #         gzip storage/en_rules.bin
+  #         gzip storage/de_tokenizer.bin
+  #         gzip storage/de_rules.bin
+  #         gzip storage/es_tokenizer.bin
+  #         gzip storage/es_rules.bin
+  #     - name: Update version
+  #       run: |
+  #         bash scripts/set_version.sh ${{ github.event.release.tag_name }}
+  #     - name: Publish on crates.io
+  #       run: | # --allow-dirty is only needed b/c of the README.md, we can be sure it is clean otherwise anyway because it is freshly checked out
+  #         cargo login $CARGO_KEY
 
-          cd nlprule
-          cp ../README.md README.md
-          cargo publish --allow-dirty
-          rm README.md
-          cd ..
+  #         cd nlprule
+  #         cp ../README.md README.md
+  #         cargo publish --allow-dirty
+  #         rm README.md
+  #         cd ..
 
-          # allow crates.io index to update s. t. nlprule-build can depend on nlprule
-          sleep 1m
+  #         # allow crates.io index to update s. t. nlprule-build can depend on nlprule
+  #         sleep 1m
 
-          cd build
-          cargo publish --allow-dirty
-          cd ..
-        env:
-          CARGO_KEY: ${{ secrets.CARGO_KEY }}
-      - name: Publish on PyPI
-        run: |
-          pip install twine==3.3
-          twine upload python-wheel/*
-        env:
-          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
-          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
-      - name: Upload release binaries
-        uses: alexellis/upload-assets@0.2.2
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-        with:
-          asset_paths: '["./storage/*"]'
-      - run: |
-          rm -r python-wheel
-          rm -r storage
-      - uses: stefanzweifel/git-auto-commit-action@v4
-        with:
-          commit_message: v${{ github.event.release.tag_name }}
-          branch: main
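+  #         cd build  # NOTE(review): this change deletes build/Cargo.toml; rework this publish step before re-enabling the job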
+  #         cargo publish --allow-dirty
+  #         cd ..
+  #       env:
+  #         CARGO_KEY: ${{ secrets.CARGO_KEY }}
+  #     - name: Publish on PyPI
+  #       run: |
+  #         pip install twine==3.3
+  #         twine upload python-wheel/*
+  #       env:
+  #         TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+  #         TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+  #     - name: Upload release binaries
+  #       uses: alexellis/upload-assets@0.2.2
+  #       env:
+  #         GITHUB_TOKEN: ${{ github.token }}
+  #       with:
+  #         asset_paths: '["./storage/*"]'
+  #     - run: |
+  #         rm -r python-wheel
+  #         rm -r storage
+  #     - uses: stefanzweifel/git-auto-commit-action@v4
+  #       with:
+  #         commit_message: v${{ github.event.release.tag_name }}
+  #         branch: main
diff --git a/Cargo.toml b/Cargo.toml
index a8efc09..d636835 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,6 @@
 [workspace]
 members = [
     "nlprule",
-    "build",
     "python"
 ]
 
diff --git a/build/Cargo.toml b/build/Cargo.toml
deleted file mode 100644
index 726af20..0000000
--- a/build/Cargo.toml
+++ /dev/null
@@ -1,26 +0,0 @@
-[package]
-name = "nlprule-build"
-version = "0.6.3"
-authors = ["Benjamin Minixhofer <bminixhofer@gmail.com>", "Bernhard Schuster <bernhard@ahoi.io>"]
-edition = "2018"
-license = "MIT OR Apache-2.0"
-description = "Build tools for a fast, low-resource Natural Language Processing and Error Correction library."
-repository = "https://github.com/bminixhofer/nlprule"
-keywords = ["text", "spelling", "language-processing", "nlp", "grammar"]
-categories = ["science", "text-processing"]
-
-[dependencies]
-flate2 = "1"
-thiserror = "1"
-zip = "0.5.9"
-directories = "3"
-reqwest = { version = "0.11", default_features = false, features = ["blocking", "rustls-tls"] }
-nlprule = { path = "../nlprule", features = ["compile"], version = "0.6.3" } # BUILD_BINDINGS_COMMENT
-# nlprule = { package = "nlprule-core", path = "../nlprule", features = ["compile"] } # BUILD_BINDINGS_UNCOMMENT
-fs-err = "2.5"
-
-[dev-dependencies]
-tempdir = "0.3"
-smush = "0.1.5"
-env_logger = "0.8"
-nlprule_030 = { package = "nlprule", version = "0.3.0" }
diff --git a/build/README.md b/build/README.md
index c826acc..db2c7ec 100644
--- a/build/README.md
+++ b/build/README.md
@@ -1,34 +1,6 @@
 # nlprule-build
 
-This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule). It also provides:
-1. Utility functions to download the binaries from their distribution source.
-2. Scripts to create the nlprule build directories.
-
-## Development
-
-If you are using a development version of nlprule, the builder can build the binaries itself (instead of just fetching them):
-
-```rust
-let nlprule_builder = nlprule_build::BinaryBuilder::new(
-    &["en"],
-    std::env::var("OUT_DIR").expect("OUT_DIR is set when build.rs is running"),
-)
-// this specifies that the binaries should be built if they are not found
-.fallback_to_build_dir(true)
-.build()
-.validate();
-```
-
-In that case, you should set
-
-```toml
-[profile.dev]
-build-override = { opt-level = 2 }
-```
-
-in your `Cargo.toml`. Building can be slow otherwise.
-
-The following has information how to acquire the nlpruile build directories and how to build and test the nlprule binaries. As a user you will typically not need to do this.
+Utilities for creating the nlprule build directories and for building and testing the nlprule binaries.
 
 ### Building and testing the nlprule binaries
 
diff --git a/build/languages.txt b/build/languages.txt
deleted file mode 100644
index f1723af..0000000
--- a/build/languages.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-de
-en
-es
\ No newline at end of file
diff --git a/build/src/lib.rs b/build/src/lib.rs
deleted file mode 100644
index 5dd85f8..0000000
--- a/build/src/lib.rs
+++ /dev/null
@@ -1,756 +0,0 @@
-//! This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule).
-//! See `README.md` for details.
-
-use flate2::bufread::GzDecoder;
-use fs::File;
-use fs_err as fs;
-use nlprule::{compile, rules_filename, tokenizer_filename};
-use std::fs::Permissions;
-use std::{
-    io::{self, BufReader, BufWriter, Cursor, Read},
-    path::{Path, PathBuf},
-    result,
-};
-use zip::result::ZipError;
-
-pub type OtherError = Box<dyn std::error::Error + Send + Sync + 'static>;
-
-#[derive(Debug, thiserror::Error)]
-pub enum Error {
-    #[error(transparent)]
-    RequestError(#[from] reqwest::Error),
-    #[error("Binaries were not found on the remote")]
-    BinariesNotFound,
-    #[error("Failed to validate {1:?} binary for lang {0}")]
-    ValidationFailed(String, Binary, #[source] nlprule::Error),
-    #[error(transparent)]
-    IoError(#[from] io::Error),
-    #[error(transparent)]
-    ZipError(#[from] ZipError),
-    #[error("error postprocessing binaries: {0}")]
-    PostprocessingError(#[source] OtherError),
-    #[error("error transforming binaries: {0}")]
-    TransformError(#[source] OtherError),
-    #[error("Collation failed")]
-    CollationFailed(#[source] nlprule::compile::Error),
-}
-
-pub type Result<T> = result::Result<T, Error>;
-
-/// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer binaries.
-pub type TransformDataFn = Box<dyn Fn(&[u8], &mut Vec<u8>) -> result::Result<(), OtherError>>;
-
-/// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer binaries.
-pub type TransformPathFn = Box<dyn Fn(PathBuf) -> result::Result<PathBuf, OtherError>>;
-
-#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)]
-pub enum Binary {
-    Tokenizer,
-    Rules,
-}
-
-impl Binary {
-    fn filename(&self, lang_code: &str) -> String {
-        match &self {
-            Binary::Tokenizer => tokenizer_filename(lang_code),
-            Binary::Rules => rules_filename(lang_code),
-        }
-    }
-}
-
-/// Tries downloading the binaries from their distribution source.
-///
-/// This implicitly unpacks the originally gzip'd sources and returns
-/// an in-memory buffer.
-fn obtain_binary_from_github_release(
-    version: &str,
-    lang_code: &str,
-    binary: Binary,
-) -> Result<Vec<u8>> {
-    let filename = binary.filename(lang_code);
-
-    let bytes = reqwest::blocking::get(&format!(
-        "https://github.com/bminixhofer/nlprule/releases/download/{}/{}.gz",
-        version, filename
-    ))?
-    .error_for_status()
-    .map_err(|e| {
-        if let Some(404) = e.status().map(|x| x.as_u16()) {
-            Error::BinariesNotFound
-        } else {
-            e.into()
-        }
-    })?
-    .bytes()?;
-
-    let mut gz = GzDecoder::new(&bytes[..]);
-    let mut buffer = Vec::new();
-    gz.read_to_end(&mut buffer)?;
-
-    Ok(buffer)
-}
-
-fn construct_cache_path(
-    version: &str,
-    lang_code: &str,
-    binary: Binary,
-    cache_dir: Option<&PathBuf>,
-    transform_path_fn: Option<&TransformPathFn>,
-) -> Result<Option<PathBuf>> {
-    let filename = binary.filename(lang_code);
-
-    cache_dir
-        .map(move |dir| {
-            let path = dir.join(version).join(lang_code).join(&filename);
-            Ok(if let Some(transform_path_fn) = transform_path_fn {
-                transform_path_fn(path).map_err(Error::TransformError)?
-            } else {
-                path
-            })
-        })
-        .transpose()
-}
-
-/// Returns the bytes for a binary which are either obtained
-/// from the on-disk cache or from the distribution source.
-/// If the on-disk cache is disabled or is not present,
-/// it will attempt to download it via [`obtain_binary_from_github_release`].
-/// Also updates the cache.
-///
-/// If `transform_data_fn` is set, the bytes returned from this function are the output
-/// of `transform_data_fn` applied to the binencoded binaries.
-fn obtain_binary_cache_or_github(
-    version: &str,
-    lang_code: &str,
-    binary: Binary,
-    cache_dir: Option<&PathBuf>,
-    transform_path_fn: Option<&TransformPathFn>,
-    transform_data_fn: Option<&TransformDataFn>,
-) -> Result<Vec<u8>> {
-    let cache_path =
-        construct_cache_path(version, lang_code, binary, cache_dir, transform_path_fn)?;
-
-    // if the file can be read, the data is already cached and the transform was applied before
-    if let Some(ref cache_path) = cache_path {
-        if let Ok(bytes) = fs::read(cache_path) {
-            return Ok(bytes);
-        }
-    }
-
-    // the binencoded data from github
-    let bytes_binenc = obtain_binary_from_github_release(version, lang_code, binary)?;
-
-    // apply the transform if any to an intermediate buffer
-    let bytes_transformed = if let Some(transform_data_fn) = transform_data_fn {
-        let mut intermediate = Vec::<u8>::new();
-        transform_data_fn(bytes_binenc.as_slice(), &mut intermediate)
-            .map_err(Error::TransformError)?;
-        intermediate
-    } else {
-        bytes_binenc
-    };
-
-    // update the cache entry
-    if let Some(ref cache_path) = cache_path {
-        fs::create_dir_all(cache_path.parent().expect("path must have parent"))?;
-        let mut cache_file = fs::OpenOptions::new()
-            .truncate(true)
-            .create(true)
-            .write(true)
-            .open(cache_path)?;
-        io::copy(&mut bytes_transformed.as_slice(), &mut cache_file)?;
-    }
-
-    Ok(bytes_transformed)
-}
-
-fn assure_binary_availability(
-    version: &str,
-    lang_code: &str,
-    binary: Binary,
-    cache_dir: Option<&PathBuf>,
-    transform_path_fn: Option<&TransformPathFn>,
-    transform_data_fn: Option<&TransformDataFn>,
-    out: PathBuf,
-) -> Result<()> {
-    let source = obtain_binary_cache_or_github(
-        version,
-        lang_code,
-        binary,
-        cache_dir,
-        transform_path_fn,
-        transform_data_fn,
-    )?;
-
-    let mut out_file = fs::OpenOptions::new()
-        .truncate(true)
-        .create(true)
-        .write(true)
-        .open(out)?;
-    io::copy(&mut source.as_slice(), &mut out_file)?;
-    Ok(())
-}
-
-pub fn get_build_dir<P: AsRef<Path>>(lang_code: &str, out_dir: P) -> Result<()> {
-    let bytes = reqwest::blocking::get(&format!(
-        "https://f000.backblazeb2.com/file/nlprule/{}.zip",
-        lang_code
-    ))?
-    .error_for_status()?
-    .bytes()?;
-
-    // extract the zip file and write to directory, a bit annoying that this is so verbose
-    // adapted from https://github.com/zip-rs/zip/blob/master/examples/extract.rs
-    let mut archive = zip::ZipArchive::new(Cursor::new(bytes))?;
-
-    for i in 0..archive.len() {
-        let mut file = archive.by_index(i)?;
-        let outpath = match file.enclosed_name() {
-            Some(path) => out_dir
-                .as_ref()
-                // the first component of the path is the zip file name e. g. "en" so we skip it
-                .join(path.iter().skip(1).collect::<PathBuf>()),
-            None => continue,
-        };
-
-        if (&*file.name()).ends_with('/') {
-            fs::create_dir_all(&outpath)?;
-        } else {
-            if let Some(p) = outpath.parent() {
-                if !p.exists() {
-                    fs::create_dir_all(&p)?;
-                }
-            }
-            let mut outfile = fs::File::create(&outpath)?;
-            io::copy(&mut file, &mut outfile)?;
-        }
-
-        // Get and Set permissions
-        #[cfg(unix)]
-        {
-            use std::os::unix::fs::PermissionsExt;
-
-            if let Some(mode) = file.unix_mode() {
-                fs::set_permissions(&outpath, Permissions::from_mode(mode))?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Gets the language codes for the currently supported languages in ISO 639-1 (two-letter) format e. g. "en".
-pub fn supported_language_codes() -> Vec<&'static str> {
-    include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/", "languages.txt"))
-        .lines()
-        .collect()
-}
-
-/// Places all nlprule binaries for the given languages in some directory.
-pub struct BinaryBuilder {
-    language_codes: Vec<String>,
-    out_dir: PathBuf,
-    version: String,
-    cache_dir: Option<PathBuf>,
-    fallback_to_build_dir: bool,
-    build_dir: Option<PathBuf>,
-    outputs: Vec<PathBuf>,
-    transform_path_fn: Option<TransformPathFn>,
-    transform_data_fn: Option<TransformDataFn>,
-}
-
-impl BinaryBuilder {
-    /// ```plain
-    /// github release resource --[fn transform]--> $cache_dir --[fn postprocess]--> $OUT_DIR/
-    /// ```
-    ///
-    /// Acquires the rule and tokenizer binaries for one language by:
-    /// - Trying to download them from their distribution source (or load them local cache).
-    /// - If they are not found (i. e. a dev version of nlprule is used) and `fallback_to_build_dir` is true
-    /// downloads the latest build directory and builds the binaries from it.
-    /// This can still fail if the dev version is sufficiently outdated for the latest build dir.
-    /// In that case, the user is encouraged to update to a release or a newer git sha.
-    fn build_language(&mut self, lang_code: &str) -> Result<()> {
-        // adjust the destination path
-        let path_transform = |out: PathBuf| -> Result<PathBuf> {
-            Ok(
-                if let Some(ref transform_path_fn) = self.transform_path_fn {
-                    transform_path_fn(out).map_err(Error::TransformError)?
-                } else {
-                    out
-                },
-            )
-        };
-
-        let tokenizer_out = path_transform(self.out_dir.join(tokenizer_filename(lang_code)))?;
-        let rules_out = path_transform(self.out_dir.join(rules_filename(lang_code)))?;
-
-        let mut did_not_find_binaries = false;
-
-        for (binary, out) in &[
-            (Binary::Tokenizer, &tokenizer_out),
-            (Binary::Rules, &rules_out),
-        ] {
-            let out = out.to_owned().to_owned();
-            match assure_binary_availability(
-                &self.version,
-                lang_code,
-                *binary,
-                self.cache_dir.as_ref(),
-                self.transform_path_fn.as_ref(),
-                self.transform_data_fn.as_ref(),
-                out,
-            ) {
-                Err(Error::BinariesNotFound) => {
-                    did_not_find_binaries = true;
-                    break;
-                }
-                res => res?,
-            }
-        }
-
-        if did_not_find_binaries && self.fallback_to_build_dir {
-            // it is possible that the build dirs are cached too long i. e. not downloaded again although a new version is available
-            // this could lead to problems but is not easy to fix so it will stay this way unless problems are reported
-            let build_dir = self
-                .build_dir.as_ref()
-                .unwrap_or_else(
-                    || self.cache_dir.as_ref().expect("need somewhere to store build dirs: either `cache_dir` or `build_dir_path` must be set if `fallback_to_build_dir` is true."),
-                )
-                .join(lang_code);
-
-            if !build_dir.exists() {
-                get_build_dir(lang_code, &build_dir).expect("error loading build directory");
-            }
-
-            let mut rules_sink = BufWriter::new(
-                fs::OpenOptions::new()
-                    .truncate(true)
-                    .create(true)
-                    .write(true)
-                    .open(&rules_out)?,
-            );
-            let mut tokenizer_sink = BufWriter::new(
-                fs::OpenOptions::new()
-                    .truncate(true)
-                    .create(true)
-                    .write(true)
-                    .open(&tokenizer_out)?,
-            );
-            if let Some(ref transform_data_fn) = self.transform_data_fn {
-                let mut transfer_buffer_rules = Vec::new();
-                let mut transfer_buffer_tokenizer = Vec::new();
-
-                compile::compile(
-                    build_dir,
-                    &mut transfer_buffer_rules,
-                    &mut transfer_buffer_tokenizer,
-                )
-                .map_err(Error::CollationFailed)?;
-
-                assert_ne!(transfer_buffer_rules.len(), 0);
-                assert_ne!(transfer_buffer_tokenizer.len(), 0);
-
-                let mut transformed_buffer_rules = Vec::new();
-                let mut transformed_buffer_tokenizer = Vec::new();
-
-                transform_data_fn(
-                    transfer_buffer_rules.as_slice(),
-                    &mut transformed_buffer_rules,
-                )
-                .map_err(Error::TransformError)?;
-                transform_data_fn(
-                    transfer_buffer_tokenizer.as_slice(),
-                    &mut transformed_buffer_tokenizer,
-                )
-                .map_err(Error::TransformError)?;
-            } else {
-                compile::compile(build_dir, &mut rules_sink, &mut tokenizer_sink)
-                    .map_err(Error::CollationFailed)?;
-            };
-        } else if did_not_find_binaries {
-            panic!(
-                "Did not find binaries for version {}. \
-                 If this is a development version, try setting `fallback_to_build_dir` to build the binaries yourself. \
-                 If this is a release, this should NOT happen.",
-                self.version
-            );
-        }
-
-        self.outputs.push(tokenizer_out);
-        self.outputs.push(rules_out);
-        Ok(())
-    }
-
-    /// Creates a new binary builder. `language_codes` must be in ISO 639-1 (two-letter) format.
-    /// If `language_codes` is `&[]`, uses all supported languages.
-    /// If this is used in a `build.rs`, `out_dir` should probably be the OUT_DIR environment variable.
-    pub fn new<P: AsRef<Path>>(language_codes: &[&str], out_dir: P) -> Self {
-        let language_codes: Vec<_> = if language_codes.is_empty() {
-            supported_language_codes()
-                .into_iter()
-                .map(ToOwned::to_owned)
-                .collect()
-        } else {
-            language_codes
-                .iter()
-                .map(ToOwned::to_owned)
-                .map(ToOwned::to_owned)
-                .collect::<Vec<String>>()
-        };
-
-        let project_dir = directories::ProjectDirs::from("", "", "nlprule");
-        // this should be CARGO_ARTIFACT_DIR once it is merged: https://github.com/rust-lang/rfcs/pull/3035
-        let cache_dir = project_dir.as_ref().map(|x| x.cache_dir().to_owned());
-        let build_dir = cache_dir.as_ref().map(|x| x.join("build_dirs"));
-
-        let version = env!("CARGO_PKG_VERSION").to_owned();
-
-        BinaryBuilder {
-            language_codes,
-            out_dir: out_dir.as_ref().to_owned(),
-            version,
-            cache_dir,
-            fallback_to_build_dir: false,
-            build_dir,
-            outputs: Vec::new(),
-            transform_data_fn: None,
-            transform_path_fn: None,
-        }
-    }
-
-    /// Sets the version for which to fetch binaries.
-    /// The version of `nlprule-build` (kept in sync with `nlprule` version) by default.
-    /// Typically does not need to be modified.
-    pub fn version<S: Into<String>>(mut self, version: S) -> Self {
-        self.version = version.into();
-        self
-    }
-
-    /// Sets the out directory.
-    pub fn out_dir(mut self, out_dir: PathBuf) -> Self {
-        self.out_dir = out_dir;
-        self
-    }
-
-    /// Sets the cache directory. The user cache directory at e. g. `~/.cache/nlprule` by default.
-    pub fn cache_dir(mut self, cache_dir: Option<PathBuf>) -> Self {
-        self.cache_dir = cache_dir;
-        self
-    }
-
-    /// Sets whether to fallback to building from the build directory if no distributed binaries are found
-    /// (i. e. a development version of nlprule is used).
-    pub fn fallback_to_build_dir(mut self, fallback_to_build_dir: bool) -> Self {
-        self.fallback_to_build_dir = fallback_to_build_dir;
-        self
-    }
-
-    /// Sets the path the build directories should be stored at.
-    /// Only relevant if `fallback_to_build_dir` is true.
-    /// `cache_dir.join("build_dirs")` by default.
-    pub fn build_dir(mut self, build_dir: Option<PathBuf>) -> Self {
-        self.build_dir = build_dir;
-        self
-    }
-
-    /// Builds by {downloading, copying, building} the binaries to the out directory.
-    pub fn build(mut self) -> Result<Self> {
-        self.language_codes
-            .clone()
-            .into_iter()
-            .try_for_each(|lang_code| self.build_language(&lang_code))?;
-        Ok(self)
-    }
-
-    /// Validates the binaries by checking if they can be loaded by nlprule.
-    pub fn validate(&self) -> Result<()> {
-        for lang_code in &self.language_codes {
-            let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code));
-            let rules_out = self.out_dir.join(rules_filename(lang_code));
-
-            nlprule::Rules::new(rules_out)
-                .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Rules, e))?;
-            nlprule::Tokenizer::new(tokenizer_out)
-                .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Tokenizer, e))?;
-        }
-
-        Ok(())
-    }
-
-    /// Gets the paths to all files this builder created.
-    pub fn outputs(&self) -> &[PathBuf] {
-        &self.outputs
-    }
-
-    /// Applies the given transformation function to the binary immediately after obtaining it.
-    /// This happens before placing the file in the cache (if any) so by using a compression
-    /// function the size of the cache directory can be reduced.
-    /// Modifies the path of the cached binaries by the given `path_fn`.
-    /// If no cache directory is set or the binaries are built from the build dir, the `path_fn` does nothing.
-    ///
-    /// The resulting files will then reside in the given cache dir if any.
-    ///
-    /// Attention: Any compression applied here, must be undone in the
-    /// `fn postprocess` provided closure to retain the original binenc file
-    /// to be consumed by the application code.
-    pub fn transform<D, P>(mut self, proc_fn: D, path_fn: P) -> Self
-    where
-        // these signatures have to match the `TransformDataFn` and `TransformPathFn` types
-        D: Fn(&[u8], &mut Vec<u8>) -> result::Result<(), OtherError> + 'static,
-        P: Fn(PathBuf) -> result::Result<PathBuf, OtherError> + 'static,
-    {
-        self.transform_data_fn = Some(Box::new(proc_fn));
-        self.transform_path_fn = Some(Box::new(path_fn));
-        self
-    }
-
-    /// Applies the given postprocessing function to the binaries e. g. for compression.
-    /// Modifies the output path by the given path function.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// # use nlprule_build::BinaryBuilder;
-    /// # use std::io::Write;
-    /// # let tempdir = tempdir::TempDir::new("builder_test")?;
-    /// # let tempdir = tempdir.path();
-    /// #
-    /// # let mut builder = BinaryBuilder::new(&["en"], tempdir).version("0.3.0");
-    /// builder
-    ///    .build()?
-    ///    .postprocess(
-    ///        |reader, mut writer| {
-    ///            let mut encoder = flate2::read::GzEncoder::new(reader, flate2::Compression::default());
-    ///            std::io::copy(&mut encoder, &mut writer)?;
-    ///            Ok(())
-    ///        },
-    ///        |p| {
-    ///            let mut path = p.as_os_str().to_os_string();
-    ///            path.push(".gz");
-    ///            path
-    ///        },
-    ///    )?;
-    /// # Ok::<(), nlprule_build::Error>(())
-    /// ```
-    pub fn postprocess<F, C, P>(mut self, proc_fn: C, path_fn: F) -> Result<Self>
-    where
-        C: Fn(BufReader<File>, BufWriter<File>) -> result::Result<(), OtherError>,
-        F: Fn(PathBuf) -> P,
-        P: AsRef<Path>,
-    {
-        for (i, path) in self.outputs.clone().into_iter().enumerate() {
-            let reader = BufReader::new(fs::File::open(&path)?);
-
-            let new_path = path_fn(path.clone());
-            let new_path = new_path.as_ref();
-
-            let writer = BufWriter::new(File::create(new_path)?);
-
-            proc_fn(reader, writer).map_err(Error::PostprocessingError)?;
-
-            if new_path != path {
-                self.outputs[i] = new_path.to_path_buf();
-                fs::remove_file(path)?;
-            }
-        }
-
-        Ok(self)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use io::Write;
-
-    use super::*;
-
-    #[test]
-    fn getting_binary_works() -> Result<()> {
-        // this is nice to keep roughly in sync with the latest released version but it is not necessary
-        let tempdir = tempdir::TempDir::new("build_dir")?;
-        let tempdir = tempdir.path().join("foo.bin");
-        assure_binary_availability("0.3.0", "en", Binary::Rules, None, None, None, tempdir)?;
-
-        Ok(())
-    }
-
-    #[test]
-    fn getting_build_dir_works() -> Result<()> {
-        let _ = env_logger::builder().is_test(true).try_init();
-
-        let tempdir = tempdir::TempDir::new("build_dir_test")?;
-        let tempdir = tempdir.path();
-
-        get_build_dir("en", &tempdir)?;
-
-        assert_eq!(fs::read_to_string(tempdir.join("lang_code.txt"))?, "en");
-
-        Ok(())
-    }
-
-    // TODO: causes problems in CI, maybe remove `fallback_to_build_dir` altogether?
-    // #[test]
-    // fn binary_builder_works() -> Result<()> {
-    //     let tempdir = tempdir::TempDir::new("builder_test")?;
-    //     let tempdir = tempdir.path();
-
-    //     BinaryBuilder::new(&["en"], tempdir)
-    //         .cache_dir(Some(tempdir.to_path_buf()))
-    //         .fallback_to_build_dir(true)
-    //         .build()?
-    //         .validate()?;
-
-    //     Ok(())
-    // }
-
-    #[test]
-    fn binary_builder_works_with_released_version() -> Result<()> {
-        let tempdir = tempdir::TempDir::new("builder_test")?;
-        let tempdir = tempdir.path();
-
-        BinaryBuilder::new(&["en"], tempdir)
-            .version("0.3.0")
-            .build()?;
-
-        Ok(())
-    }
-
-    #[test]
-    fn binary_builder_works_with_smush() -> Result<()> {
-        let tempdir = tempdir::TempDir::new("builder_test")?;
-        let tempdir = tempdir.path();
-
-        BinaryBuilder::new(&["en"], tempdir)
-            .version("0.3.0")
-            .build()?
-            .postprocess(
-                |mut buffer, mut writer| {
-                    let mut tmp = Vec::new();
-                    buffer.read_to_end(&mut tmp)?;
-                    Ok(writer.write_all(&smush::encode(
-                        &tmp,
-                        smush::Codec::Gzip,
-                        smush::Quality::Default,
-                    )?)?)
-                },
-                |p| {
-                    let mut path = p.as_os_str().to_os_string();
-                    path.push(".gz");
-                    path
-                },
-            )?;
-
-        let tokenizer_path = tempdir
-            .join(Path::new(&tokenizer_filename("en")))
-            .with_extension("bin.gz");
-        assert!(tokenizer_path.exists());
-        let decoded = smush::decode(&fs::read(tokenizer_path)?, smush::Codec::Gzip).unwrap();
-
-        let _ = nlprule_030::Tokenizer::new_from(&mut decoded.as_slice()).unwrap();
-
-        Ok(())
-    }
-
-    #[test]
-    fn binary_builder_works_with_flate2() -> Result<()> {
-        let tempdir = tempdir::TempDir::new("builder_test")?;
-        let tempdir = tempdir.path();
-
-        let builder = BinaryBuilder::new(&["en"], tempdir)
-            .version("0.3.0")
-            .build()?
-            .postprocess(
-                |mut buffer, writer| {
-                    let mut tmp = Vec::new();
-                    buffer.read_to_end(&mut tmp)?;
-                    Ok(
-                        flate2::write::GzEncoder::new(writer, flate2::Compression::default())
-                            .write_all(&tmp)?,
-                    )
-                },
-                |p| {
-                    let mut path = p.as_os_str().to_os_string();
-                    path.push(".gz");
-                    path
-                },
-            )?;
-
-        assert_eq!(
-            builder.outputs(),
-            &[
-                tempdir.join("en_tokenizer.bin.gz"),
-                tempdir.join("en_rules.bin.gz")
-            ]
-        );
-
-        let rules_path = tempdir
-            .join(Path::new(&rules_filename("en")))
-            .with_extension("bin.gz");
-        assert!(rules_path.exists());
-
-        let encoded = fs::read(rules_path)?;
-        let mut decoder = flate2::read::GzDecoder::new(&encoded[..]);
-
-        let mut decoded = Vec::new();
-        decoder.read_to_end(&mut decoded).unwrap();
-
-        let _ = nlprule_030::Rules::new_from(&mut decoded.as_slice()).unwrap();
-
-        Ok(())
-    }
-
-    #[test]
-    fn build_with_zstd_transform() -> Result<()> {
-        let tempdir = tempdir::TempDir::new("builder_test")?;
-        let tempdir = tempdir.path();
-
-        let builder = BinaryBuilder::new(&["en"], tempdir)
-            .version("0.3.0")
-            .transform(
-                |buffer, writer| {
-                    let data = smush::encode(buffer, smush::Codec::Zstd, smush::Quality::Maximum)?;
-                    writer.write_all(&data)?;
-                    Ok(())
-                },
-                |p: PathBuf| {
-                    let mut s = p.to_string_lossy().to_string();
-                    s.push_str(".zstd");
-                    Ok(PathBuf::from(s))
-                },
-            )
-            .build()?
-            .postprocess(
-                |mut buffer, mut writer| {
-                    let mut tmp = Vec::new();
-                    buffer.read_to_end(&mut tmp)?;
-                    let data = smush::decode(tmp.as_slice(), smush::Codec::Zstd)?;
-                    writer.write_all(data.as_slice())?;
-                    Ok(())
-                },
-                |p| {
-                    let path = p.to_string_lossy();
-                    assert!(path.ends_with(".zstd"));
-                    let end = path.len().saturating_sub(".zstd".len());
-                    assert_ne!(end, 0);
-                    path[..end].to_owned()
-                },
-            )?;
-
-        assert_eq!(
-            builder.outputs(),
-            &[
-                tempdir.join("en_tokenizer.bin"),
-                tempdir.join("en_rules.bin")
-            ]
-        );
-
-        let rules_path = tempdir
-            .join(Path::new(&rules_filename("en")))
-            .with_extension("bin");
-        assert!(rules_path.is_file());
-
-        let _ = nlprule_030::Rules::new(rules_path).unwrap();
-        Ok(())
-    }
-}
diff --git a/data/de/disambiguator_options.json b/data/de/disambiguator_options.json
new file mode 100644
index 0000000..92bc9fc
--- /dev/null
+++ b/data/de/disambiguator_options.json
@@ -0,0 +1,7 @@
+{
+    "allow_errors": false,
+    "ignore_ids": [
+        "DISAMBIGUATION/SUB_BEAMTE/1",
+        "DISAMBIGUATION/SUB_BEAMTE/2"
+    ]
+}
\ No newline at end of file
diff --git a/nlprule/configs/de/rules.json b/data/de/rules_options.json
similarity index 100%
rename from nlprule/configs/de/rules.json
rename to data/de/rules_options.json
diff --git a/nlprule/configs/de/tagger.json b/data/de/tagger_options.json
similarity index 100%
rename from nlprule/configs/de/tagger.json
rename to data/de/tagger_options.json
diff --git a/nlprule/configs/en/tokenizer.json b/data/de/tokenizer_options.json
similarity index 75%
rename from nlprule/configs/en/tokenizer.json
rename to data/de/tokenizer_options.json
index 11eae18..0e8eb37 100644
--- a/nlprule/configs/en/tokenizer.json
+++ b/data/de/tokenizer_options.json
@@ -1,8 +1,4 @@
 {
-    "allow_errors": false,
-    "ignore_ids": [
-        "DISAMBIGUATION/BEST_JJS/0"
-    ],
     "extra_join_regexes": [
         "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})"
     ]
diff --git a/data/en/disambiguator_options.json b/data/en/disambiguator_options.json
new file mode 100644
index 0000000..e76a2cc
--- /dev/null
+++ b/data/en/disambiguator_options.json
@@ -0,0 +1,6 @@
+{
+    "allow_errors": false,
+    "ignore_ids": [
+        "DISAMBIGUATION/BEST_JJS/0"
+    ]
+}
\ No newline at end of file
diff --git a/nlprule/configs/en/rules.json b/data/en/rules_options.json
similarity index 100%
rename from nlprule/configs/en/rules.json
rename to data/en/rules_options.json
diff --git a/nlprule/configs/en/tagger.json b/data/en/tagger_options.json
similarity index 100%
rename from nlprule/configs/en/tagger.json
rename to data/en/tagger_options.json
diff --git a/nlprule/configs/de/tokenizer.json b/data/en/tokenizer_options.json
similarity index 67%
rename from nlprule/configs/de/tokenizer.json
rename to data/en/tokenizer_options.json
index df21ad8..0e8eb37 100644
--- a/nlprule/configs/de/tokenizer.json
+++ b/data/en/tokenizer_options.json
@@ -1,9 +1,4 @@
 {
-    "allow_errors": false,
-    "ignore_ids": [
-        "DISAMBIGUATION/SUB_BEAMTE/1",
-        "DISAMBIGUATION/SUB_BEAMTE/2"
-    ],
     "extra_join_regexes": [
         "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})"
     ]
diff --git a/data/es/disambiguator_options.json b/data/es/disambiguator_options.json
new file mode 100644
index 0000000..8305874
--- /dev/null
+++ b/data/es/disambiguator_options.json
@@ -0,0 +1,4 @@
+{
+    "allow_errors": true,
+    "ignore_ids": []
+}
\ No newline at end of file
diff --git a/nlprule/configs/es/rules.json b/data/es/rules_options.json
similarity index 100%
rename from nlprule/configs/es/rules.json
rename to data/es/rules_options.json
diff --git a/nlprule/configs/es/tagger.json b/data/es/tagger_options.json
similarity index 100%
rename from nlprule/configs/es/tagger.json
rename to data/es/tagger_options.json
diff --git a/nlprule/configs/es/tokenizer.json b/data/es/tokenizer_options.json
similarity index 89%
rename from nlprule/configs/es/tokenizer.json
rename to data/es/tokenizer_options.json
index 402aa9b..10cac1f 100644
--- a/nlprule/configs/es/tokenizer.json
+++ b/data/es/tokenizer_options.json
@@ -1,6 +1,4 @@
 {
-    "allow_errors": true,
-    "ignore_ids": [],
     "extra_split_chars": [
         "-",
         "─",
diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml
index ff6628c..36bce79 100644
--- a/nlprule/Cargo.toml
+++ b/nlprule/Cargo.toml
@@ -53,21 +53,22 @@ quickcheck = "1.0"
 quickcheck_macros = "1.0"
 criterion = "0.3"
 
-[build-dependencies]
-serde_json = "1"
-fs-err = "2.5"
-
 [[bench]]
 name = "load"
 harness = false
+required-features = ["binaries-en"] # benches/load.rs uses nlprule::lang::en
 
 [features]
 default = ["regex-onig"]
 
+binaries-de = []
+binaries-en = []
+binaries-es = []
+binaries-all = ["binaries-de", "binaries-en", "binaries-es"]
+
 regex-onig = ["onig"]
 # to switch to the fancy-regex engine, disable default features and add this feature
 regex-fancy = ["fancy-regex"]
-
 # this enables both regex backends at the same time and makes sure they are equivalent
 # used only for compilation and tests
 regex-all-test = ["regex-onig", "regex-fancy"]
@@ -89,13 +90,17 @@ name = "compile"
 required-features = ["compile", "bin"]
 
 [[bin]]
-name = "test"
-required-features = ["bin"]
+name = "test_en"
+required-features = ["bin", "binaries-en"]
 
 [[bin]]
-name = "run"
-required-features = ["bin"]
+name = "test_es"
+required-features = ["bin", "binaries-es"]
 
 [[bin]]
-name = "test_disambiguation"
+name = "test_de"
+required-features = ["bin", "binaries-de"]
+
+[[bin]]
+name = "run"
 required-features = ["bin"]
diff --git a/nlprule/benches/load.rs b/nlprule/benches/load.rs
index 0e0c010..9dab5a8 100644
--- a/nlprule/benches/load.rs
+++ b/nlprule/benches/load.rs
@@ -1,17 +1,13 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use nlprule::{Rules, Tokenizer};
+use criterion::{criterion_group, criterion_main, Criterion};
+use nlprule::lang::en;
 use std::time::Duration;
 
 fn parse_tokenizer(c: &mut Criterion) {
-    c.bench_function("load tokenizer", |b| {
-        b.iter(|| Tokenizer::new(black_box("../storage/en_tokenizer.bin")).unwrap())
-    });
+    c.bench_function("load tokenizer", |b| b.iter(en::analyzer));
 }
 
 fn parse_rules(c: &mut Criterion) {
-    c.bench_function("load rules", |b| {
-        b.iter(|| Rules::new(black_box("../storage/en_rules.bin")).unwrap())
-    });
+    c.bench_function("load rules", |b| b.iter(en::rules));
 }
 
 fn no_warmup_criterion() -> Criterion {
diff --git a/nlprule/build.rs b/nlprule/build.rs
deleted file mode 100644
index 8eb2ad1..0000000
--- a/nlprule/build.rs
+++ /dev/null
@@ -1,62 +0,0 @@
-//! Compiles the language build configurations in configs/ into two files (one for the tokenizer, one for the rules)
-//! so they can be inlined. These configs are included at compile time because they define the neccessary parameters to
-//! run the rules for a language correctly. They are NOT user configuration.
-
-use fs::File;
-use fs_err as fs;
-use std::{collections::HashMap, io::BufWriter, path::Path};
-
-fn main() {
-    let path = env!("CARGO_MANIFEST_DIR");
-    let path = Path::new(path).join("configs");
-
-    let out_dir =
-        std::env::var("OUT_DIR").expect("OUT_DIR env var must be set when build.rs is run");
-    let out_dir = Path::new(&out_dir);
-
-    println!("cargo:rerun-if-changed={}", path.display());
-
-    for (filename, joined_filename) in &[
-        ("tokenizer.json", "tokenizer_configs.json"),
-        ("rules.json", "rules_configs.json"),
-        ("tagger.json", "tagger_configs.json"),
-    ] {
-        let mut config_map: HashMap<String, serde_json::Value> = HashMap::new();
-
-        for entry in fs::read_dir(&path).expect("must be able to read config dir") {
-            let entry = entry.expect("must be able to read config dir entry");
-
-            println!("cargo:rerun-if-changed={}", entry.path().display());
-
-            if entry.path().is_dir() {
-                let lang_code = entry
-                    .path()
-                    .file_name()
-                    .expect("directory must have name")
-                    .to_str()
-                    .expect("directory name must be unicode")
-                    .to_string();
-
-                let path = entry.path().join(filename);
-
-                println!("cargo:rerun-if-changed={}", path.display());
-
-                let json_str = fs::read_to_string(path)
-                    .unwrap_or_else(|_| panic!("{} for 'lang_code' must exist", filename));
-
-                config_map.insert(
-                    lang_code,
-                    serde_json::from_str(&json_str)
-                        .unwrap_or_else(|_| panic!("{} for language must be valid json", filename)),
-                );
-            }
-        }
-
-        let config_writer = BufWriter::new(
-            File::create(out_dir.join(joined_filename))
-                .expect("must be able to create file in out dir"),
-        );
-        serde_json::to_writer_pretty(config_writer, &config_map)
-            .expect("must be able to write JSON to file");
-    }
-}
diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs
index 6f63412..f17cea4 100644
--- a/nlprule/src/bin/compile.rs
+++ b/nlprule/src/bin/compile.rs
@@ -1,7 +1,19 @@
 use clap::Clap;
+use fs::{File, OpenOptions};
 use fs_err as fs;
-use nlprule::compile::{compile, Error};
-use std::io::BufWriter;
+
+use log::{info, warn};
+use nlprule::compile::{BuildComponent, BuildInfo, Error};
+use nlprule::components::{
+    chunker::Chunker,
+    multiword_tagger::MultiwordTagger,
+    rules::{Disambiguator, Rules},
+    tagger::Tagger,
+    tokenizer::Tokenizer,
+    Component,
+};
+use serde::{Deserialize, Serialize};
+use std::path::Path;
 use std::path::PathBuf;
 
 #[derive(clap::Clap)]
@@ -13,17 +25,96 @@ pub struct BuildOptions {
     #[clap(long, parse(from_os_str))]
     pub build_dir: PathBuf,
     #[clap(long, parse(from_os_str))]
-    pub tokenizer_out: PathBuf,
-    #[clap(long, parse(from_os_str))]
-    pub rules_out: PathBuf,
+    pub out_dir: PathBuf,
+}
+
+#[derive(Serialize, Deserialize)] // `main` serializes this with serde_json and passes the value to each component's `build`
+struct BuildFilePaths { // paths of the input files the compile binary derives from the build directory
+    lang_code: PathBuf,
+    tag_dict: Vec<PathBuf>, // several files; `new` supplies a dump and an additions list
+    tag_remove_dict: Vec<PathBuf>,
+    chunker: PathBuf,
+    disambiguator_xml: PathBuf,
+    rules_xml: PathBuf,
+    multiword_tags: PathBuf,
+    common_words: PathBuf,
+    regex_cache: PathBuf, // `main` also writes the regex cache back to this path at the end
+    srx: PathBuf,
+    tagger_options: PathBuf,
+    rules_options: PathBuf,
+    tokenizer_options: PathBuf,
+    disambiguator_options: PathBuf,
+}
+
+impl BuildFilePaths {
+    fn new<P: AsRef<Path>>(build_dir: P) -> Self { // joins fixed file names onto `build_dir`; nothing is opened or checked here
+        let p = build_dir.as_ref();
+        BuildFilePaths {
+            lang_code: p.join("lang_code.txt"),
+            tag_dict: vec![p.join("tags/output.dump"), p.join("tags/added.txt")],
+            tag_remove_dict: vec![p.join("tags/removed.txt")],
+            chunker: p.join("chunker.json"),
+            disambiguator_xml: p.join("disambiguation.xml"),
+            rules_xml: p.join("grammar.xml"),
+            multiword_tags: p.join("tags/multiwords.txt"),
+            common_words: p.join("common.txt"),
+            regex_cache: p.join("regex_cache.bin"), // same path `main` writes the cache to after building
+            srx: p.join("segment.srx"),
+            tagger_options: p.join("tagger_options.json"),
+            rules_options: p.join("rules_options.json"),
+            tokenizer_options: p.join("tokenizer_options.json"),
+            disambiguator_options: p.join("disambiguator_options.json"),
+        }
+    }
 }
 
 fn main() -> Result<(), Error> {
     env_logger::init();
     let opts = BuildOptions::parse();
+    let paths = BuildFilePaths::new(opts.build_dir);
+
+    fs::create_dir_all(&opts.out_dir)?;
+
+    let paths_value = serde_json::to_value(&paths)?;
+
+    let tagger = Tagger::build(serde_json::from_value(paths_value.clone())?, None)?;
+    let mut build_info = BuildInfo::new(&tagger, &paths.regex_cache)?;
+
+    // Builds one component and writes it to `<out_dir>/<name>.bin`. An `Err` from `build` is
+    // only logged so the remaining components are still attempted; I/O errors propagate via `?`.
+    macro_rules! build {
+        ($component:ty) => {
+            info!("Creating component \"{}\".", <$component>::name());
+            let instance_result = <$component>::build(
+                serde_json::from_value(paths_value.clone())?,
+                Some(&mut build_info),
+            );
+            match instance_result {
+                Ok(instance) => {
+                    instance.to_writer(
+                        &OpenOptions::new()
+                            .write(true)
+                            .create(true)
+                            .truncate(true) // discard stale bytes of a previous, larger output file
+                            .open(opts.out_dir.join(format!("{}.bin", <$component>::name())))?,
+                    )?;
+                }
+                Err(error) => warn!("Error creating \"{0}\": {1}. This is expected if the component does not exist for this language.", <$component>::name(), error),
+            }
+        };
+    }
+
+    build!(Tokenizer);
+    build!(Disambiguator);
+    build!(MultiwordTagger);
+    build!(Chunker);
+    build!(Rules);
 
-    let tokenizer_sink = BufWriter::new(fs::File::create(&opts.tokenizer_out)?);
-    let rules_sink = BufWriter::new(fs::File::create(&opts.rules_out)?);
+    // write the regex cache at the end, otherwise it isn't fully populated
+    bincode::serialize_into(
+        &File::create(&paths.regex_cache)?,
+        build_info.mut_regex_cache(),
+    )?;
 
-    compile(opts.build_dir, rules_sink, tokenizer_sink)
+    Ok(())
 }
diff --git a/nlprule/src/bin/run.rs b/nlprule/src/bin/run.rs
index 1b4258c..d8c419d 100644
--- a/nlprule/src/bin/run.rs
+++ b/nlprule/src/bin/run.rs
@@ -1,28 +1,29 @@
-use clap::Clap;
-use nlprule::{rules::Rules, tokenizer::Tokenizer};
+// use clap::Clap;
+// use nlprule::{rules::Rules, tokenizer::Tokenizer};
 
-#[derive(Clap)]
-#[clap(
-    version = "1.0",
-    author = "Benjamin Minixhofer <bminixhofer@gmail.com>"
-)]
-struct Opts {
-    text: String,
-    #[clap(long, short)]
-    tokenizer: String,
-    #[clap(long, short)]
-    rules: String,
-}
+// #[derive(Clap)]
+// #[clap(
+//     version = "1.0",
+//     author = "Benjamin Minixhofer <bminixhofer@gmail.com>"
+// )]
+// struct Opts {
+//     text: String,
+//     #[clap(long, short)]
+//     tokenizer: String,
+//     #[clap(long, short)]
+//     rules: String,
+// }
 
-fn main() {
-    env_logger::init();
-    let opts = Opts::parse();
+fn main() {}
+// fn main() {
+//     env_logger::init();
+//     let opts = Opts::parse();
 
-    let tokenizer = Tokenizer::new(opts.tokenizer).unwrap();
-    let rules = Rules::new(opts.rules).unwrap();
+//     let tokenizer = Tokenizer::new(opts.tokenizer).unwrap();
+//     let rules = Rules::new(opts.rules).unwrap();
 
-    let tokens = tokenizer.pipe(&opts.text);
+//     let tokens = tokenizer.pipe(&opts.text);
 
-    println!("Tokens: {:#?}", tokens.collect::<Vec<_>>());
-    println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer));
-}
+//     println!("Tokens: {:#?}", tokens.collect::<Vec<_>>());
+//     println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer));
+// }
diff --git a/nlprule/src/bin/test.rs b/nlprule/src/bin/test.rs
deleted file mode 100644
index 3669a8e..0000000
--- a/nlprule/src/bin/test.rs
+++ /dev/null
@@ -1,41 +0,0 @@
-use clap::Clap;
-use nlprule::{rules::Rules, tokenizer::Tokenizer};
-
-#[derive(Clap)]
-#[clap(
-    version = "1.0",
-    author = "Benjamin Minixhofer <bminixhofer@gmail.com>"
-)]
-struct Opts {
-    #[clap(long, short)]
-    tokenizer: String,
-    #[clap(long, short)]
-    rules: String,
-    #[clap(long, short)]
-    ids: Vec<String>,
-}
-
-fn main() {
-    env_logger::init();
-    let opts = Opts::parse();
-
-    let tokenizer = Tokenizer::new(opts.tokenizer).unwrap();
-    let rules_container = Rules::new(opts.rules).unwrap();
-    let rules = rules_container.rules();
-
-    println!("Runnable rules: {}", rules.len());
-
-    let mut passes = 0;
-    for rule in rules {
-        if opts.ids.is_empty() || opts.ids.contains(&rule.id().to_string()) {
-            passes += rule.test(&tokenizer) as usize;
-        }
-    }
-
-    println!("Rules passing tests: {}", passes);
-    if passes == rules.len() {
-        std::process::exit(0);
-    } else {
-        std::process::exit(1);
-    }
-}
diff --git a/nlprule/src/bin/test_de.rs b/nlprule/src/bin/test_de.rs
new file mode 100644
index 0000000..af75f27
--- /dev/null
+++ b/nlprule/src/bin/test_de.rs
@@ -0,0 +1,6 @@
+use nlprule::lang::de;
+
+fn main() -> Result<(), nlprule::Error> {
+    env_logger::init();
+    de::correcter().test()
+}
diff --git a/nlprule/src/bin/test_disambiguation.rs b/nlprule/src/bin/test_disambiguation.rs
deleted file mode 100644
index 30321a3..0000000
--- a/nlprule/src/bin/test_disambiguation.rs
+++ /dev/null
@@ -1,42 +0,0 @@
-use clap::Clap;
-use nlprule::tokenizer::Tokenizer;
-
-#[derive(Clap)]
-#[clap(
-    version = "1.0",
-    author = "Benjamin Minixhofer <bminixhofer@gmail.com>"
-)]
-struct Opts {
-    #[clap(long)]
-    stop_at_error: bool,
-    #[clap(long, short)]
-    tokenizer: String,
-}
-
-fn main() {
-    env_logger::init();
-    let opts = Opts::parse();
-
-    let tokenizer = Tokenizer::new(opts.tokenizer).unwrap();
-    let rules = tokenizer.rules();
-
-    println!("Last ID: {}", rules[rules.len() - 1].id());
-    println!("Runnable rules: {}", rules.len());
-
-    let mut passes = 0;
-
-    for rule in rules {
-        if rule.test(&tokenizer) {
-            passes += 1;
-        } else if opts.stop_at_error {
-            break;
-        }
-    }
-
-    println!("Rules passing tests: {}", passes);
-    if passes == rules.len() {
-        std::process::exit(0);
-    } else {
-        std::process::exit(1);
-    }
-}
diff --git a/nlprule/src/bin/test_en.rs b/nlprule/src/bin/test_en.rs
new file mode 100644
index 0000000..f7268fc
--- /dev/null
+++ b/nlprule/src/bin/test_en.rs
@@ -0,0 +1,6 @@
+use nlprule::lang::en;
+
+fn main() -> Result<(), nlprule::Error> {
+    env_logger::init();
+    en::correcter().test()
+}
diff --git a/nlprule/src/bin/test_es.rs b/nlprule/src/bin/test_es.rs
new file mode 100644
index 0000000..c6c3ace
--- /dev/null
+++ b/nlprule/src/bin/test_es.rs
@@ -0,0 +1,6 @@
+use nlprule::lang::es;
+
+fn main() -> Result<(), nlprule::Error> {
+    env_logger::init();
+    es::correcter().test()
+}
diff --git a/nlprule/src/compile/impls.rs b/nlprule/src/compile/impls.rs
deleted file mode 100644
index ec7fd7d..0000000
--- a/nlprule/src/compile/impls.rs
+++ /dev/null
@@ -1,822 +0,0 @@
-use bimap::BiMap;
-use fs_err::File;
-use log::warn;
-use serde::{Deserialize, Serialize};
-use std::{
-    collections::{HashMap, HashSet},
-    hash::{Hash, Hasher},
-    io::{self, BufRead, BufReader},
-    path::Path,
-};
-
-use crate::{
-    rule::{
-        disambiguation::PosFilter,
-        engine::{
-            composition::{GraphId, Matcher, PosMatcher, TextMatcher},
-            Engine,
-        },
-        id::Category,
-        DisambiguationRule, Rule,
-    },
-    rules::{Rules, RulesLangOptions},
-    tokenizer::{
-        chunk,
-        multiword::{MultiwordTagger, MultiwordTaggerFields},
-        tag::{Tagger, TaggerLangOptions, WordIdMap},
-        Tokenizer, TokenizerLangOptions,
-    },
-    types::*,
-    utils::{parallelism::MaybeParallelIterator, regex::Regex},
-};
-
-use super::{parse_structure::BuildInfo, Error};
-
-impl Tagger {
-    fn get_lines<S1: AsRef<Path>, S2: AsRef<Path>>(
-        paths: &[S1],
-        remove_paths: &[S2],
-    ) -> std::io::Result<Vec<(String, String, String)>> {
-        let mut output = Vec::new();
-        let mut disallowed: Vec<String> = Vec::new();
-
-        for path in remove_paths {
-            let file = File::open(path.as_ref())?;
-            let reader = std::io::BufReader::new(file);
-
-            for line in reader.lines() {
-                let line = line?;
-                if line.starts_with('#') {
-                    continue;
-                }
-
-                disallowed.push(line.to_string());
-            }
-        }
-
-        for path in paths {
-            let file = File::open(path.as_ref())?;
-            let reader = std::io::BufReader::new(file);
-
-            for line in reader.lines() {
-                let line = line?;
-                if line.starts_with('#') {
-                    continue;
-                }
-
-                if disallowed.contains(&line) {
-                    continue;
-                }
-
-                let parts: Vec<_> = line.split('\t').collect();
-
-                let word = parts[0].to_string();
-                let inflection = parts[1].to_string();
-                let tag = parts[2].to_string();
-
-                output.push((word, inflection, tag))
-            }
-        }
-
-        Ok(output)
-    }
-
-    /// Creates a tagger from raw files.
-    ///
-    /// # Arguments
-    /// * `paths`: Paths to files where each line contains the word, lemma and tag, respectively,
-    /// separated by tabs, to be added to the tagger.
-    /// * `remove_paths`: Paths to files where each line contains the word, lemma and tag, respectively,
-    /// separated by tabs, to be removed from the tagger if present in the files from `paths`.
-    pub(in crate::compile) fn from_dumps<S1: AsRef<Path>, S2: AsRef<Path>>(
-        paths: &[S1],
-        remove_paths: &[S2],
-        common_words: &HashSet<String>,
-        lang_options: TaggerLangOptions,
-    ) -> std::io::Result<Self> {
-        let mut tag_store = HashSet::new();
-        let mut word_store = HashSet::new();
-
-        // add language specific special tags
-        tag_store.extend(lang_options.extra_tags.iter().map(|x| x.as_str()));
-
-        let lines = Tagger::get_lines(paths, remove_paths)?;
-
-        let punct = "!\"#$%&\\'()*+,-./:;<=>?@[\\]^_`{|}~";
-        for i in 0..punct.len() {
-            word_store.insert(&punct[i..(i + 1)]);
-        }
-
-        word_store.extend(common_words.iter().map(|x| x.as_str()));
-
-        for (word, inflection, tag) in lines.iter() {
-            word_store.insert(word);
-            word_store.insert(inflection);
-            tag_store.insert(tag);
-        }
-
-        // the empty string must not be part of any wordlist
-        assert!(!word_store.contains(""));
-
-        // word store ids should be consistent across runs
-        let mut word_store: Vec<_> = word_store.into_iter().collect();
-        word_store.sort_unstable();
-
-        // add special empty string to wordlist, must be the first element to have id 0
-        word_store.insert(0, "");
-
-        // tag store ids should be consistent across runs
-        let mut tag_store: Vec<_> = tag_store.into_iter().collect();
-        tag_store.sort_unstable();
-
-        // add special part of speech tags, they must have ids starting from zero
-        for (i, special_pos) in SpecialPos::iter().enumerate() {
-            tag_store.insert(i, special_pos);
-        }
-
-        let word_store: BiMap<_, _> = word_store
-            .iter()
-            .enumerate()
-            .map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32)))
-            .collect();
-        let tag_store: BiMap<_, _> = tag_store
-            .iter()
-            .enumerate()
-            .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16)))
-            .collect();
-
-        let mut tags: Vec<Option<Vec<(WordIdInt, PosIdInt)>>> = vec![None; word_store.len()];
-
-        for (word, inflection, tag) in lines.iter() {
-            let word_id = word_store.get_by_left(word).unwrap();
-            let lemma_id = word_store.get_by_left(inflection).unwrap();
-            let pos_id = tag_store.get_by_left(tag).unwrap();
-
-            match &mut tags[word_id.value() as usize] {
-                Some(vec) => {
-                    vec.push((*lemma_id, *pos_id));
-                }
-                None => {
-                    tags[word_id.value() as usize] = Some(vec![(*lemma_id, *pos_id)]);
-                }
-            }
-        }
-
-        Ok(Tagger {
-            tags: WordIdMap(tags),
-            word_store,
-            tag_store,
-            lang_options,
-        })
-    }
-}
-
-impl MultiwordTagger {
-    pub(in crate::compile) fn from_dump<P: AsRef<Path>>(
-        dump: P,
-        info: &BuildInfo,
-    ) -> Result<Self, io::Error> {
-        let reader = BufReader::new(File::open(dump.as_ref())?);
-        let mut multiwords = Vec::new();
-
-        for line in reader.lines() {
-            let line = line?;
-
-            // strip comments
-            let line = &line[..line.find('#').unwrap_or_else(|| line.len())].trim();
-            if line.is_empty() {
-                continue;
-            }
-            let tab_split: Vec<_> = line.split('\t').collect();
-
-            let word: String = tab_split[0]
-                .split_whitespace()
-                .collect::<Vec<_>>()
-                .join(" ");
-            let pos = info.tagger().id_tag(tab_split[1]).into_static();
-            multiwords.push((word, pos));
-        }
-
-        Ok((MultiwordTaggerFields { multiwords }).into())
-    }
-}
-
-impl TextMatcher {
-    pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
-        // can not cache a matcher that depends on the graph
-        let set = if matcher.graph_id().is_some() {
-            None
-        } else if let either::Right(regex) = &matcher.matcher {
-            let mut hasher = DefaultHasher::default();
-            regex.hash(&mut hasher);
-            matcher.negate.hash(&mut hasher);
-            matcher.empty_always_false.hash(&mut hasher);
-            let matcher_hash = hasher.finish();
-
-            if let Some(set) = info.mut_regex_cache().get(&matcher_hash) {
-                set.clone()
-            } else {
-                let data: Vec<_> = info.tagger().word_store().iter().collect();
-
-                let set: DefaultHashSet<_> = data
-                    .into_maybe_par_iter()
-                    .filter_map(|(word, id)| {
-                        if matcher.is_match(word.as_str(), None, None) {
-                            Some(*id)
-                        } else {
-                            None
-                        }
-                    })
-                    .collect();
-
-                // there are some regexes which match lots of strings
-                // this cutoff is pretty arbitrary but without any threshold the size of some sets blows up
-                // the vast majority of regexes matches less than 100 strings from manual inspection
-                let set = if set.len() > 100 { None } else { Some(set) };
-                info.mut_regex_cache().insert(matcher_hash, set.clone());
-                set
-            }
-        } else {
-            None
-        };
-
-        TextMatcher { matcher, set }
-    }
-}
-
-impl PosMatcher {
-    pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
-        let mut mask = vec![false; info.tagger().tag_store().len()];
-
-        for (word, id) in info.tagger().tag_store().iter() {
-            mask[id.value() as usize] = matcher.is_match(word.as_str(), None, None);
-        }
-
-        PosMatcher { mask }
-    }
-}
-
-impl Rules {
-    pub(in crate::compile) fn from_xml<P: AsRef<Path>>(
-        path: P,
-        build_info: &mut BuildInfo,
-        options: RulesLangOptions,
-    ) -> Self {
-        let rules = super::parse_structure::read_rules(path);
-        let mut errors: HashMap<String, usize> = HashMap::new();
-
-        let rules: Vec<_> = rules
-            .into_iter()
-            .filter_map(|x| match x {
-                Ok((rule_structure, group, category)) => {
-                    let category = category.expect("grammar rules must have category");
-                    let id = Category::new(category.id.as_str());
-
-                    let id = if let Some(group) = &group {
-                        id.join(group.id.as_str()).join(group.n)
-                    } else {
-                        id.join(
-                            rule_structure
-                                .id
-                                .as_ref()
-                                .expect("ID must be set if not in group."),
-                        )
-                        .join(0)
-                    };
-
-                    let rule_on = match rule_structure.default.as_deref() {
-                        Some("off") | Some("temp_off") => false,
-                        Some("on") | None => true,
-                        Some(x) => panic!("unknown `default` value: {}", x),
-                    };
-
-                    let group_on = match group.as_ref().and_then(|x| x.default.as_deref()) {
-                        Some("off") | Some("temp_off") => false,
-                        Some("on") | None => true,
-                        Some(x) => panic!("unknown `default` value: {}", x),
-                    };
-
-                    let category_on = match category.default.as_deref() {
-                        Some("off") | Some("temp_off") => false,
-                        Some("on") | None => true,
-                        Some(x) => panic!("unknown `default` value: {}", x),
-                    };
-
-                    let name = rule_structure.name.as_ref().map_or_else(
-                        || {
-                            let group = group.as_ref().expect("must have group if name not set");
-                            group.name.clone()
-                        },
-                        |x| x.clone(),
-                    );
-
-                    match Rule::from_rule_structure(rule_structure, build_info) {
-                        Ok(mut rule) => {
-                            if (options.ids.is_empty()
-                                || options.ids.iter().any(|x| x.is_match(&id)))
-                                && !options.ignore_ids.iter().any(|x| x.is_match(&id))
-                            {
-                                rule.id = id;
-                                rule.name = name;
-                                rule.category_name = category.name;
-                                rule.category_type = category.kind;
-                                rule.enabled = category_on && group_on && rule_on;
-                                Some(rule)
-                            } else {
-                                None
-                            }
-                        }
-                        Err(x) => {
-                            *errors.entry(format!("[Rule] {}", x)).or_insert(0) += 1;
-                            None
-                        }
-                    }
-                }
-                Err(x) => {
-                    *errors.entry(format!("[Structure] {}", x)).or_insert(0) += 1;
-                    None
-                }
-            })
-            .collect();
-
-        if !errors.is_empty() {
-            let mut errors: Vec<(String, usize)> = errors.into_iter().collect();
-            errors.sort_by_key(|x| -(x.1 as i32));
-
-            warn!(
-                "Errors constructing Rules: {:#?}",
-                &errors
-                    .iter()
-                    .map(|(message, number)| format!("{} (n={})", message, number))
-                    .collect::<Vec<_>>()
-            );
-        }
-
-        Rules { rules }
-    }
-}
-
-impl Tokenizer {
-    pub(in crate::compile) fn from_xml<P: AsRef<Path>>(
-        path: P,
-        build_info: &mut BuildInfo,
-        chunker: Option<chunk::Chunker>,
-        multiword_tagger: Option<MultiwordTagger>,
-        sentencizer: srx::Rules,
-        lang_options: TokenizerLangOptions,
-    ) -> Result<Self, Error> {
-        let rules = super::parse_structure::read_disambiguation_rules(path);
-        let mut error = None;
-
-        let rules: Vec<_> = rules
-            .into_iter()
-            .filter_map(|x| match x {
-                Ok((rule_structure, group, _)) => {
-                    let id = Category::new("DISAMBIGUATION");
-
-                    let id = if let Some(group) = &group {
-                        id.join(group.id.as_str()).join(group.n)
-                    } else {
-                        id.join(
-                            rule_structure
-                                .id
-                                .as_ref()
-                                .expect("ID must be set if not in group."),
-                        )
-                        .join(0)
-                    };
-
-                    match DisambiguationRule::from_rule_structure(rule_structure, build_info) {
-                        Ok(mut rule) => {
-                            if error.is_none()
-                                && (lang_options.ids.is_empty()
-                                    || lang_options.ids.iter().any(|x| x.is_match(&id)))
-                                && !lang_options.ignore_ids.iter().any(|x| x.is_match(&id))
-                            {
-                                rule.id = id;
-
-                                Some(rule)
-                            } else {
-                                None
-                            }
-                        }
-                        Err(x) => {
-                            if error.is_none() {
-                                error = Some(format!("[Rule] {}", x));
-                            }
-                            None
-                        }
-                    }
-                }
-                Err(x) => {
-                    if error.is_none() {
-                        error = Some(format!("[Structure] {}", x));
-                    }
-                    None
-                }
-            })
-            .collect();
-
-        if let Some(x) = error {
-            if lang_options.allow_errors {
-                warn!("Error constructing Disambiguator: {}", x)
-            } else {
-                return Err(Error::Unexpected(format!(
-                    "Error constructing Disambiguator: {}",
-                    x
-                )));
-            }
-        }
-
-        Ok(Tokenizer {
-            tagger: build_info.tagger().clone(),
-            sentencizer,
-            chunker,
-            multiword_tagger,
-            rules,
-            lang_options,
-        })
-    }
-}
-
-#[derive(Deserialize)]
-struct ModelData {
-    outcome_labels: Vec<String>,
-    pmap: DefaultHashMap<String, ContextData>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub(in crate::compile) struct ContextData {
-    parameters: Vec<f32>,
-    outcomes: Vec<usize>,
-}
-
-impl From<ModelData> for chunk::Model {
-    fn from(data: ModelData) -> Self {
-        let mut outcomes: Vec<usize> = Vec::new();
-        let mut parameters: Vec<f32> = Vec::new();
-
-        let pmap = data
-            .pmap
-            .into_iter()
-            .map(|(key, value)| {
-                assert_eq!(value.outcomes.len(), value.parameters.len());
-
-                let offset = outcomes.len();
-                let length = value.outcomes.len();
-
-                outcomes.extend(value.outcomes);
-                parameters.extend(value.parameters);
-
-                (chunk::hash::hash_str(&key), (offset, length))
-            })
-            .collect::<DefaultHashMap<_, _>>();
-
-        chunk::Model {
-            outcome_labels: data.outcome_labels,
-            outcomes,
-            parameters,
-            pmap,
-        }
-    }
-}
-
-impl chunk::Chunker {
-    pub(in crate::compile) fn from_json<R: std::io::Read>(
-        reader: R,
-    ) -> Result<chunk::Chunker, serde_json::Error> {
-        #[derive(Deserialize)]
-        struct ChunkData {
-            token_model: ModelData,
-            pos_model: ModelData,
-            pos_tagdict: DefaultHashMap<String, Vec<String>>,
-            chunk_model: ModelData,
-        }
-
-        let chunk_data: ChunkData = serde_json::from_reader(reader)?;
-        Ok(chunk::Chunker {
-            token_model: chunk::MaxentTokenizer {
-                model: chunk_data.token_model.into(),
-            },
-            pos_model: chunk::MaxentPosTagger {
-                model: chunk_data.pos_model.into(),
-                tagdict: chunk_data.pos_tagdict,
-            },
-            chunk_model: chunk::MaxentChunker {
-                model: chunk_data.chunk_model.into(),
-            },
-        })
-    }
-}
-
-impl PosFilter {
-    pub(in crate::compile) fn new(matcher: PosMatcher) -> Self {
-        PosFilter { matcher }
-    }
-}
-
-impl Regex {
-    pub(in crate::compile) fn from_java_regex(
-        java_regex_str: &str,
-        full_match: bool,
-        case_sensitive: bool,
-    ) -> Result<Self, Error> {
-        let regex_string =
-            super::utils::from_java_regex(java_regex_str, case_sensitive, full_match)?;
-
-        let regex = Regex::new(regex_string);
-        if let Err(error) = regex.try_compile() {
-            return Err(Error::Regex(error));
-        }
-
-        Ok(regex)
-    }
-}
-
-impl Engine {
-    pub(in crate::compile) fn to_graph_id(&self, id: usize) -> Result<GraphId, Error> {
-        let mut id = GraphId(id);
-
-        let map = match &self {
-            Engine::Token(engine) => &engine.composition.id_to_idx,
-            Engine::Text(_, id_to_idx) => &id_to_idx,
-        };
-
-        let max_id = *map
-            .keys()
-            .max()
-            .ok_or_else(|| Error::Unexpected("graph is empty".into()))?;
-
-        // ideally this should throw an error but LT is more lenient than nlprule
-        if !map.contains_key(&id) {
-            id = max_id;
-        }
-
-        Ok(id)
-    }
-}
-
-mod composition {
-    use super::*;
-    use crate::{
-        rule::engine::composition::{
-            AndAtom, Atom, Composition, FalseAtom, GraphId, NotAtom, OffsetAtom, OrAtom, Part,
-            Quantifier, TrueAtom,
-        },
-        utils::regex::Regex,
-    };
-
-    impl Atom {
-        fn iter_mut<'a>(&'a mut self) -> Box<dyn Iterator<Item = &'a mut Atom> + 'a> {
-            match self {
-                Atom::ChunkAtom(_)
-                | Atom::SpaceBeforeAtom(_)
-                | Atom::TextAtom(_)
-                | Atom::WordDataAtom(_)
-                | Atom::FalseAtom(_)
-                | Atom::TrueAtom(_) => Box::new(std::iter::once(self)),
-                Atom::AndAtom(x) => Box::new(x.atoms.iter_mut()),
-                Atom::OrAtom(x) => Box::new(x.atoms.iter_mut()),
-                Atom::NotAtom(x) => x.atom.iter_mut(),
-                Atom::OffsetAtom(x) => x.atom.iter_mut(),
-            }
-        }
-
-        pub(in crate::compile) fn mut_graph_ids(&mut self) -> Vec<&mut GraphId> {
-            let mut ids = Vec::new();
-
-            for atom in self.iter_mut() {
-                let id = match atom {
-                    Atom::ChunkAtom(atom) => atom.matcher.mut_graph_id(),
-                    Atom::TextAtom(atom) => atom.matcher.matcher.mut_graph_id(),
-                    Atom::WordDataAtom(atom) => atom
-                        .matcher
-                        .inflect_matcher
-                        .as_mut()
-                        .and_then(|x| x.matcher.mut_graph_id()),
-                    _ => {
-                        continue;
-                    }
-                };
-
-                if let Some(id) = id {
-                    ids.push(id);
-                }
-            }
-
-            ids
-        }
-    }
-
-    impl Matcher {
-        pub(in crate::compile) fn new_regex(
-            regex: Regex,
-            negate: bool,
-            empty_always_false: bool,
-        ) -> Self {
-            Matcher {
-                matcher: either::Right(regex),
-                negate,
-                case_sensitive: true, // handled by regex, should maybe be an option
-                empty_always_false,
-            }
-        }
-
-        pub(in crate::compile) fn new_string(
-            string_or_idx: either::Either<String, GraphId>,
-            negate: bool,
-            case_sensitive: bool,
-            empty_always_false: bool,
-        ) -> Self {
-            Matcher {
-                matcher: either::Left(string_or_idx),
-                negate,
-                case_sensitive,
-                empty_always_false,
-            }
-        }
-
-        pub(in crate::compile) fn graph_id(&self) -> Option<GraphId> {
-            if let either::Left(either::Right(id)) = &self.matcher {
-                Some(*id)
-            } else {
-                None
-            }
-        }
-
-        pub(in crate::compile) fn mut_graph_id(&mut self) -> Option<&mut GraphId> {
-            if let either::Left(either::Right(id)) = &mut self.matcher {
-                Some(id)
-            } else {
-                None
-            }
-        }
-    }
-
-    impl Quantifier {
-        pub(in crate::compile) fn new(min: usize, max: usize) -> Self {
-            assert!(max >= min);
-            Quantifier { min, max }
-        }
-    }
-
-    impl AndAtom {
-        pub(in crate::compile) fn and(atoms: Vec<Atom>) -> Atom {
-            let mut atoms: Vec<_> = atoms
-                .into_iter()
-                .filter(|x| !matches!(x, Atom::TrueAtom { .. }))
-                .collect();
-
-            if atoms.is_empty() {
-                (TrueAtom {}).into()
-            } else if atoms.len() == 1 {
-                atoms.remove(0)
-            } else {
-                (AndAtom { atoms }).into()
-            }
-        }
-    }
-
-    impl OrAtom {
-        pub(in crate::compile) fn or(atoms: Vec<Atom>) -> Atom {
-            let mut atoms: Vec<_> = atoms
-                .into_iter()
-                .filter(|x| !matches!(x, Atom::FalseAtom { .. }))
-                .collect();
-
-            if atoms.is_empty() {
-                (FalseAtom {}).into()
-            } else if atoms.len() == 1 {
-                atoms.remove(0)
-            } else {
-                (OrAtom { atoms }).into()
-            }
-        }
-    }
-
-    impl NotAtom {
-        pub(in crate::compile) fn not(atom: Atom) -> Atom {
-            match atom {
-                Atom::TrueAtom { .. } => FalseAtom::default().into(),
-                Atom::FalseAtom { .. } => TrueAtom::default().into(),
-                x => (NotAtom { atom: Box::new(x) }).into(),
-            }
-        }
-    }
-
-    impl OffsetAtom {
-        pub(in crate::compile) fn new(atom: Atom, offset: isize) -> Self {
-            OffsetAtom {
-                atom: Box::new(atom),
-                offset,
-            }
-        }
-    }
-
-    impl Composition {
-        pub(in crate::compile) fn new(mut parts: Vec<Part>) -> Result<Self, Error> {
-            let mut id_to_idx = DefaultHashMap::default();
-            id_to_idx.insert(GraphId(0), 0);
-            let mut current_id = 1;
-
-            for (i, part) in parts.iter().enumerate() {
-                if part.visible {
-                    id_to_idx.insert(GraphId(current_id), i + 1);
-                    current_id += 1;
-                }
-            }
-
-            let can_stop_mask = (0..parts.len())
-                .map(|i| parts[i..].iter().all(|x| x.quantifier.min == 0))
-                .collect();
-
-            for (i, part) in parts.iter_mut().enumerate() {
-                for id in part.atom.mut_graph_ids() {
-                    loop {
-                        let index = *id_to_idx.get(&id).ok_or_else(|| {
-                            Error::Unexpected(format!("id must exist in graph: {:?}", id))
-                        })?;
-
-                        // ideally this should throw an error but LT is more lenient than nlprule
-                        if index > i {
-                            *id = GraphId(id.0 - 1);
-                        } else {
-                            break;
-                        }
-                    }
-                }
-            }
-
-            Ok(Composition {
-                parts,
-                id_to_idx,
-                can_stop_mask,
-            })
-        }
-    }
-}
-
-pub(in crate::compile) mod filters {
-    use super::Error;
-    use std::collections::HashMap;
-
-    use crate::{filter::*, rule::engine::Engine, utils::regex::Regex};
-
-    trait FromArgs: Sized {
-        fn from_args(args: HashMap<String, String>, engine: &Engine) -> Result<Self, Error>;
-    }
-
-    impl FromArgs for NoDisambiguationEnglishPartialPosTagFilter {
-        fn from_args(args: HashMap<String, String>, engine: &Engine) -> Result<Self, Error> {
-            if args.contains_key("negate_postag") {
-                panic!("negate_postag not supported in NoDisambiguationEnglishPartialPosTagFilter");
-            }
-
-            Ok(NoDisambiguationEnglishPartialPosTagFilter {
-                id: engine.to_graph_id(args
-                    .get("no")
-                    .ok_or_else(|| {
-                        Error::Unexpected(
-                            "NoDisambiguationEnglishPartialPosTagFilter must have `no` argument"
-                                .into(),
-                        )
-                    })?
-                    .parse::<usize>()?)?,
-                regexp: Regex::from_java_regex(
-                    &args.get("regexp").ok_or_else(|| {
-                        Error::Unexpected(
-                        "NoDisambiguationEnglishPartialPosTagFilter must have `regexp` argument"
-                            .into(),
-                    )
-                    })?,
-                    true,
-                    true,
-                )?,
-                postag_regexp: Regex::from_java_regex(
-                    &args.get("postag_regexp").ok_or_else(|| {
-                        Error::Unexpected(
-                        "NoDisambiguationEnglishPartialPosTagFilter must have `postag_regexp` argument"
-                            .into(),
-                    )
-                    })?,
-                    true,
-                    true,
-                )?,
-                negate_postag: args.get("negate_postag").map_or(false, |x| x == "yes"),
-            })
-        }
-    }
-
-    pub(in crate::compile) fn get_filter(
-        name: &str,
-        args: HashMap<String, String>,
-        engine: &Engine,
-    ) -> Result<Filter, Error> {
-        match name {
-            "NoDisambiguationEnglishPartialPosTagFilter" => {
-                Ok(NoDisambiguationEnglishPartialPosTagFilter::from_args(args, engine)?.into())
-            }
-            _ => Err(Error::Unexpected(format!("unsupported filter {}", name))),
-        }
-    }
-}
diff --git a/nlprule/src/compile/mod.rs b/nlprule/src/compile/mod.rs
index c1258e1..965978c 100644
--- a/nlprule/src/compile/mod.rs
+++ b/nlprule/src/compile/mod.rs
@@ -1,62 +1,19 @@
-//! Creates the nlprule binaries from a *build directory*. Usage information in /build/README.md.
-
-use fs::File;
-use fs_err as fs;
-
 use std::{
     hash::{Hash, Hasher},
-    io::{self, BufReader, BufWriter},
+    io::BufReader,
     num::ParseIntError,
-    path::{Path, PathBuf},
-    str::FromStr,
-    sync::Arc,
+    path::Path,
 };
 
-use crate::{
-    rules::Rules,
-    tokenizer::{chunk::Chunker, multiword::MultiwordTagger, tag::Tagger, Tokenizer},
-    types::DefaultHasher,
-};
-use log::info;
-
-use self::parse_structure::{BuildInfo, RegexCache};
-use thiserror::Error;
+pub mod utils;
 
-mod impls;
-mod parse_structure;
-mod structure;
-mod utils;
-
-struct BuildFilePaths {
-    lang_code_path: PathBuf,
-    tag_paths: Vec<PathBuf>,
-    tag_remove_paths: Vec<PathBuf>,
-    chunker_path: PathBuf,
-    disambiguation_path: PathBuf,
-    grammar_path: PathBuf,
-    multiword_tag_path: PathBuf,
-    common_words_path: PathBuf,
-    regex_cache_path: PathBuf,
-    srx_path: PathBuf,
-}
+use crate::components::tagger::Tagger;
 
-impl BuildFilePaths {
-    fn new<P: AsRef<Path>>(build_dir: P) -> Self {
-        let p = build_dir.as_ref();
-        BuildFilePaths {
-            lang_code_path: p.join("lang_code.txt"),
-            tag_paths: vec![p.join("tags/output.dump"), p.join("tags/added.txt")],
-            tag_remove_paths: vec![p.join("tags/removed.txt")],
-            chunker_path: p.join("chunker.json"),
-            disambiguation_path: p.join("disambiguation.xml"),
-            grammar_path: p.join("grammar.xml"),
-            multiword_tag_path: p.join("tags/multiwords.txt"),
-            common_words_path: p.join("common.txt"),
-            regex_cache_path: p.join("regex_cache.bin"),
-            srx_path: p.join("segment.srx"),
-        }
-    }
-}
+use crate::types::*;
+use fs_err::File;
+use log::info;
+use serde::{de::DeserializeOwned, Deserialize, Serialize};
+use thiserror::Error;
 
 #[derive(Error, Debug)]
 #[allow(missing_docs)]
@@ -71,8 +28,6 @@ pub enum Error {
     Json(#[from] serde_json::Error),
     #[error(transparent)]
     Srx(#[from] srx::Error),
-    #[error("language options do not exist for '{lang_code}'")]
-    LanguageOptionsDoNotExist { lang_code: String },
     #[error(transparent)]
     RegexSyntax(#[from] regex_syntax::ast::Error),
     #[error("regex compilation error: {0}")]
@@ -83,119 +38,90 @@ pub enum Error {
     Unimplemented(String),
     #[error(transparent)]
     ParseError(#[from] ParseIntError),
+    #[error("`BuildInfo` is required to build this component, but is unset.")]
+    BuildInfoUnset,
     #[error("unknown error: {0}")]
     Other(#[from] Box<dyn std::error::Error + Send + Sync + 'static>),
 }
 
-/// Compiles the binaries from a build directory.
-pub fn compile(
-    build_dir: impl AsRef<Path>,
-    rules_dest: impl io::Write,
-    tokenizer_dest: impl io::Write,
-) -> Result<(), Error> {
-    let paths = BuildFilePaths::new(&build_dir);
-
-    let lang_code = fs::read_to_string(paths.lang_code_path)?;
-
-    info!(
-        "Reading common words from {}.",
-        paths.common_words_path.display()
-    );
-    let common_words = fs::read_to_string(paths.common_words_path)?
-        .lines()
-        .map(|x| x.to_string())
-        .collect();
-
-    let tokenizer_lang_options = utils::tokenizer_lang_options(&lang_code).ok_or_else(|| {
-        Error::LanguageOptionsDoNotExist {
-            lang_code: lang_code.clone(),
+pub trait BuildComponent: Sized { // a component that can be built from a set of input paths
+    type Paths: DeserializeOwned; // deserializable description of the input paths this component reads
+
+    fn build(paths: Self::Paths, build_info: Option<&mut BuildInfo>) -> Result<Self, Error>; // `build_info` is optional; implementors that need it return `Error::BuildInfoUnset` when it is `None`
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct RegexCache { // serializable cache of per-matcher word-id sets
+    cache: DefaultHashMap<u64, Option<DefaultHashSet<WordIdInt>>>, // keyed by a `u64` hash; presumably `None` marks a set that was deliberately not kept — TODO confirm against all writers
+    // this is compared with the hash of the word store of the tagger
+    word_hash: u64,
+}
+
+impl RegexCache {
+    pub fn new(word_hash: u64) -> Self {
+        RegexCache {
+            cache: DefaultHashMap::default(),
+            word_hash,
         }
-    })?;
-
-    let rules_lang_options =
-        utils::rules_lang_options(&lang_code).ok_or_else(|| Error::LanguageOptionsDoNotExist {
-            lang_code: lang_code.clone(),
-        })?;
-
-    let tagger_lang_options =
-        utils::tagger_lang_options(&lang_code).ok_or_else(|| Error::LanguageOptionsDoNotExist {
-            lang_code: lang_code.clone(),
-        })?;
-
-    info!("Creating tagger.");
-    let tagger = Tagger::from_dumps(
-        &paths.tag_paths,
-        &paths.tag_remove_paths,
-        &common_words,
-        tagger_lang_options,
-    )?;
-
-    let mut hasher = DefaultHasher::default();
-    let mut word_store = tagger.word_store().iter().collect::<Vec<_>>();
-    word_store.sort_by(|a, b| a.1.cmp(b.1));
-    word_store.hash(&mut hasher);
-    let word_store_hash = hasher.finish();
-
-    let regex_cache = if let Ok(file) = File::open(&paths.regex_cache_path) {
-        let cache: RegexCache = bincode::deserialize_from(BufReader::new(file))?;
-        if *cache.word_hash() == word_store_hash {
+    }
+
+    pub fn word_hash(&self) -> &u64 {
+        &self.word_hash
+    }
+
+    pub(crate) fn get(&self, key: &u64) -> Option<&Option<DefaultHashSet<WordIdInt>>> {
+        self.cache.get(key)
+    }
+
+    pub(crate) fn insert(&mut self, key: u64, value: Option<DefaultHashSet<WordIdInt>>) {
+        self.cache.insert(key, value);
+    }
+}
+
+pub struct BuildInfo<'a> { // shared state passed to `BuildComponent::build`
+    tagger: &'a Tagger, // borrowed tagger; its word store is hashed in `new` to validate the regex cache
+    regex_cache: RegexCache, // loaded from disk when its word hash matches, otherwise created empty
+}
+
+impl<'a> BuildInfo<'a> {
+    pub fn new<P: AsRef<Path>>(tagger: &'a Tagger, regex_cache_path: P) -> Result<Self, Error> {
+        let mut hasher = DefaultHasher::default();
+        let mut word_store = tagger.word_store().iter().collect::<Vec<_>>();
+        word_store.sort_by(|a, b| a.1.cmp(b.1));
+        word_store.hash(&mut hasher);
+        let word_store_hash = hasher.finish();
+
+        let regex_cache = if let Ok(file) = File::open(regex_cache_path.as_ref()) {
+            let cache: RegexCache = bincode::deserialize_from(BufReader::new(file))?;
+            if *cache.word_hash() == word_store_hash {
+                info!(
+                    "Regex cache at {} is valid.",
+                    regex_cache_path.as_ref().display()
+                );
+                cache
+            } else {
+                info!("Regex cache was provided but is not valid. Rebuilding.");
+                RegexCache::new(word_store_hash)
+            }
+        } else {
             info!(
-                "Regex cache at {} is valid.",
-                paths.regex_cache_path.display()
+                "No regex cache provided. Building and writing to {}.",
+                regex_cache_path.as_ref().display()
             );
-            cache
-        } else {
-            info!("Regex cache was provided but is not valid. Rebuilding.");
             RegexCache::new(word_store_hash)
-        }
-    } else {
-        info!(
-            "No regex cache provided. Building and writing to {}.",
-            paths.regex_cache_path.display()
-        );
-        RegexCache::new(word_store_hash)
-    };
-
-    let mut build_info = BuildInfo::new(Arc::new(tagger), regex_cache);
-    let chunker = if paths.chunker_path.exists() {
-        info!("{} exists. Building chunker.", paths.chunker_path.display());
-        let reader = BufReader::new(File::open(paths.chunker_path)?);
-        let chunker = Chunker::from_json(reader)?;
-        Some(chunker)
-    } else {
-        None
-    };
-    let multiword_tagger = if paths.multiword_tag_path.exists() {
-        info!(
-            "{} exists. Building multiword tagger.",
-            paths.multiword_tag_path.display()
-        );
-        Some(MultiwordTagger::from_dump(
-            paths.multiword_tag_path,
-            &build_info,
-        )?)
-    } else {
-        None
-    };
-
-    info!("Creating tokenizer.");
-    let tokenizer = Tokenizer::from_xml(
-        &paths.disambiguation_path,
-        &mut build_info,
-        chunker,
-        multiword_tagger,
-        srx::SRX::from_str(&fs::read_to_string(&paths.srx_path)?)?.language_rules(lang_code),
-        tokenizer_lang_options,
-    )?;
-    tokenizer.to_writer(tokenizer_dest)?;
-
-    info!("Creating grammar rules.");
-    let rules = Rules::from_xml(&paths.grammar_path, &mut build_info, rules_lang_options);
-    rules.to_writer(rules_dest)?;
-
-    // we need to write the regex cache after building the rules, otherwise it isn't fully populated
-    let f = BufWriter::new(File::create(&paths.regex_cache_path)?);
-    bincode::serialize_into(f, build_info.mut_regex_cache())?;
-
-    Ok(())
+        };
+
+        Ok(BuildInfo {
+            tagger,
+            regex_cache,
+        })
+    }
+
+    pub fn tagger(&self) -> &'a Tagger {
+        self.tagger
+    }
+
+    pub fn mut_regex_cache(&mut self) -> &mut RegexCache {
+        &mut self.regex_cache
+    }
 }
diff --git a/nlprule/src/compile/utils.rs b/nlprule/src/compile/utils.rs
index 73b5322..53dab59 100644
--- a/nlprule/src/compile/utils.rs
+++ b/nlprule/src/compile/utils.rs
@@ -1,55 +1,3 @@
-use crate::{rules::RulesLangOptions, tokenizer::TokenizerLangOptions};
-use crate::{tokenizer::tag::TaggerLangOptions, types::*};
-use lazy_static::lazy_static;
-
-lazy_static! {
-    static ref TOKENIZER_LANG_OPTIONS: DefaultHashMap<String, TokenizerLangOptions> = {
-        serde_json::from_slice(include_bytes!(concat!(
-            env!("OUT_DIR"),
-            "/",
-            "tokenizer_configs.json"
-        )))
-        .expect("tokenizer configs must be valid JSON")
-    };
-}
-
-lazy_static! {
-    static ref RULES_LANG_OPTIONS: DefaultHashMap<String, RulesLangOptions> = {
-        serde_json::from_slice(include_bytes!(concat!(
-            env!("OUT_DIR"),
-            "/",
-            "rules_configs.json"
-        )))
-        .expect("rules configs must be valid JSON")
-    };
-}
-
-lazy_static! {
-    static ref TAGGER_LANG_OPTIONS: DefaultHashMap<String, TaggerLangOptions> = {
-        serde_json::from_slice(include_bytes!(concat!(
-            env!("OUT_DIR"),
-            "/",
-            "tagger_configs.json"
-        )))
-        .expect("tagger configs must be valid JSON")
-    };
-}
-
-/// Gets the tokenizer language options for the language code
-pub(crate) fn tokenizer_lang_options(lang_code: &str) -> Option<TokenizerLangOptions> {
-    TOKENIZER_LANG_OPTIONS.get(lang_code).cloned()
-}
-
-/// Gets the rules language options for the language code
-pub(crate) fn rules_lang_options(lang_code: &str) -> Option<RulesLangOptions> {
-    RULES_LANG_OPTIONS.get(lang_code).cloned()
-}
-
-/// Gets the tagger language options for the language code
-pub(crate) fn tagger_lang_options(lang_code: &str) -> Option<TaggerLangOptions> {
-    TAGGER_LANG_OPTIONS.get(lang_code).cloned()
-}
-
 pub(crate) use regex::from_java_regex;
 
 mod regex {
diff --git a/nlprule/src/components/chunker/compile.rs b/nlprule/src/components/chunker/compile.rs
new file mode 100644
index 0000000..73083fa
--- /dev/null
+++ b/nlprule/src/components/chunker/compile.rs
@@ -0,0 +1,84 @@
+use std::{io::BufReader, path::PathBuf};
+
+use fs_err::File;
+use serde::Deserialize;
+
+use crate::compile::{BuildComponent, BuildInfo, Error};
+
+use super::*;
+
+#[derive(Serialize, Deserialize)]
+struct ContextData { // one value of a serialized model's `pmap`
+    parameters: Vec<f32>, // must have the same length as `outcomes` (asserted when converting to `Model`)
+    outcomes: Vec<usize>, // presumably indices into `outcome_labels` — TODO confirm
+}
+
+#[derive(Deserialize)]
+struct ModelData { // deserialized (JSON) form of a model; converted into `Model` via `From`
+    outcome_labels: Vec<String>, // moved unchanged into `Model::outcome_labels`
+    pmap: DefaultHashMap<String, ContextData>, // string keys are hashed with `hash::hash_str` during conversion
+}
+
+impl From<ModelData> for Model {
+    fn from(data: ModelData) -> Self { // flattens all per-context vectors into two shared vectors plus an index
+        let ModelData {
+            outcome_labels,
+            pmap: contexts,
+        } = data;
+
+        let mut outcomes: Vec<usize> = Vec::new();
+        let mut parameters: Vec<f32> = Vec::new();
+        let mut pmap = DefaultHashMap::default();
+
+        for (context, entry) in contexts {
+            // both vectors grow in lockstep, so one (offset, length) pair addresses both
+            assert_eq!(entry.outcomes.len(), entry.parameters.len());
+
+            let span = (outcomes.len(), entry.outcomes.len());
+            outcomes.extend(entry.outcomes);
+            parameters.extend(entry.parameters);
+            pmap.insert(hash::hash_str(&context), span);
+        }
+
+        Model {
+            outcome_labels,
+            outcomes,
+            parameters,
+            pmap,
+        }
+    }
+}
+
+#[derive(Deserialize)]
+pub struct Paths { // input paths for building a `Chunker`
+    chunker: PathBuf, // JSON file holding the token, POS and chunk models plus the POS tag dictionary
+}
+
+impl BuildComponent for Chunker {
+    type Paths = Paths;
+
+    fn build(paths: Paths, _build_info: Option<&mut BuildInfo>) -> Result<Chunker, Error> { // `_build_info` is unused: the chunker is built from its JSON file alone
+        #[derive(Deserialize)]
+        struct ChunkData { // top-level layout of the chunker JSON file
+            token_model: ModelData,
+            pos_model: ModelData,
+            pos_tagdict: DefaultHashMap<String, Vec<String>>,
+            chunk_model: ModelData,
+        }
+
+        let chunk_data: ChunkData =
+            serde_json::from_reader(BufReader::new(File::open(paths.chunker)?))?; // file and JSON errors are propagated with `?`
+        Ok(Chunker {
+            token_model: MaxentTokenizer {
+                model: chunk_data.token_model.into(), // `ModelData` -> `Model` conversion
+            },
+            pos_model: MaxentPosTagger {
+                model: chunk_data.pos_model.into(),
+                tagdict: chunk_data.pos_tagdict, // moved over unchanged
+            },
+            chunk_model: MaxentChunker {
+                model: chunk_data.chunk_model.into(),
+            },
+        })
+    }
+}
diff --git a/nlprule/src/tokenizer/chunk.rs b/nlprule/src/components/chunker/mod.rs
similarity index 95%
rename from nlprule/src/tokenizer/chunk.rs
rename to nlprule/src/components/chunker/mod.rs
index 40ae936..8b6aad2 100644
--- a/nlprule/src/tokenizer/chunk.rs
+++ b/nlprule/src/components/chunker/mod.rs
@@ -1,12 +1,19 @@
 //! A Chunker ported from [OpenNLP](https://opennlp.apache.org/).
 
+#[cfg(feature = "compile")]
+mod compile;
+
 use half::bf16;
+use lazy_static::lazy_static;
 use serde::{Deserialize, Serialize};
 use std::hash::{Hash, Hasher};
 use std::{cmp::Ordering, collections::BinaryHeap};
 
+use crate::properties::*;
 use crate::types::{DefaultHashMap, DefaultHasher, Sentence};
 
+use super::Component;
+
 fn softmax(vec: &mut Vec<f32>) {
     for x in vec.iter_mut() {
         *x = x.exp();
@@ -699,9 +706,22 @@ pub struct Chunker {
     pub(crate) chunk_model: MaxentChunker,
 }
 
-impl Chunker {
-    /// Populates the `.chunks` field of the passed tokens by predicting with the maximum entropy model.
-    pub fn apply(&self, sentence: &mut Sentence) {
+impl Transform for Chunker {
+    fn properties(&self) -> PropertiesMut {
+        lazy_static! {
+            static ref PROPERTIES: PropertiesMut = Properties::default()
+                .read(&[Property::Tags])
+                .write(&[Property::Chunks]);
+        }
+        *PROPERTIES
+    }
+
+    fn transform<'t>(
+        &'t self,
+        mut sentence: Sentence<'t>,
+    ) -> Result<Sentence<'t>, crate::properties::Error> {
+        let props = self.property_guard(&mut sentence)?;
+
         let text = sentence.text().replace('’', "\'");
 
         let mut bi_to_ci: DefaultHashMap<usize, usize> = text
@@ -757,8 +777,12 @@ impl Chunker {
                     let contains_nns = sentence
                         .iter()
                         .find(|token| *token.span().char() == char_span)
-                        .map(|token| token.tags().iter().any(|tag| tag.pos().as_str() == "NNS"))
-                        .unwrap_or(false);
+                        .map(|token| {
+                            props
+                                .tags(token)
+                                .map(|tags| tags.iter().any(|tag| tag.pos().as_str() == "NNS"))
+                        })
+                        .unwrap_or(Ok(false))?;
 
                     if contains_nns {
                         number = "plural";
@@ -791,9 +815,17 @@ impl Chunker {
         for token in sentence.iter_mut() {
             for (chunk, (_, char_span)) in chunks.iter().zip(internal_chunks.iter()) {
                 if char_span == token.span().char() {
-                    *token.chunks_mut() = (*chunk).clone();
+                    *props.chunks_mut(token)? = (*chunk).clone();
                 }
             }
         }
+
+        Ok(sentence)
+    }
+}
+
+impl Component for Chunker {
+    fn name() -> &'static str {
+        "chunker"
     }
 }
diff --git a/nlprule/src/components/mod.rs b/nlprule/src/components/mod.rs
new file mode 100644
index 0000000..8cdb152
--- /dev/null
+++ b/nlprule/src/components/mod.rs
@@ -0,0 +1,30 @@
+use std::{
+    io::{BufReader, Read, Write},
+    path::Path,
+};
+
+use fs_err::File;
+use serde::{de::DeserializeOwned, Serialize};
+
+pub mod chunker;
+pub mod multiword_tagger;
+pub mod rules;
+pub mod tagger;
+pub mod tokenizer;
+
+pub trait Component: Serialize + DeserializeOwned + Clone { // a component that can be written to and read back from its bincode form
+    fn name() -> &'static str; // identifier of the component, e.g. "chunker"
+
+    fn new<P: AsRef<Path>>(p: P) -> Result<Self, crate::Error> { // opens the file at `p` and loads the component through a buffered reader
+        let reader = BufReader::new(File::open(p.as_ref())?);
+        Self::from_reader(reader)
+    }
+
+    fn from_reader<R: Read>(reader: R) -> Result<Self, crate::Error> { // deserializes with bincode; `reader` is used as given, no buffering is added here
+        Ok(bincode::deserialize_from(reader)?)
+    }
+
+    fn to_writer<W: Write>(&self, writer: W) -> Result<(), crate::Error> { // serializes `self` with bincode into `writer`
+        Ok(bincode::serialize_into(writer, self)?)
+    }
+}
diff --git a/nlprule/src/components/multiword_tagger/compile.rs b/nlprule/src/components/multiword_tagger/compile.rs
new file mode 100644
index 0000000..c02b959
--- /dev/null
+++ b/nlprule/src/components/multiword_tagger/compile.rs
@@ -0,0 +1,46 @@
+use std::{
+    io::{BufRead, BufReader},
+    path::PathBuf,
+};
+
+use fs_err::File;
+
+use crate::compile::{BuildComponent, BuildInfo, Error};
+
+use super::*;
+
+#[derive(Deserialize)]
+pub struct Paths { // input paths for building a `MultiwordTagger`
+    multiword_tags: PathBuf, // text file read line by line: tab-separated fields, `#` starts a comment
+}
+
+impl BuildComponent for MultiwordTagger {
+    type Paths = Paths;
+
+    fn build(paths: Paths, info: Option<&mut BuildInfo>) -> Result<Self, Error> { // reads `<phrase>\t<tag>` lines; fails with `BuildInfoUnset` when `info` is `None`
+        let tagger = info.ok_or(Error::BuildInfoUnset)?.tagger(); // tags are resolved through the shared tagger
+
+        let reader = BufReader::new(File::open(paths.multiword_tags)?);
+        let mut multiwords = Vec::new();
+
+        for line in reader.lines() {
+            let line = line?;
+
+            // strip comments
+            let line = &line[..line.find('#').unwrap_or_else(|| line.len())].trim();
+            if line.is_empty() {
+                continue;
+            }
+            let mut fields = line.split('\t');
+            let phrase = fields.next().unwrap_or_default(); // `split` always yields at least one field
+            let word = phrase.split_whitespace().collect::<Vec<_>>().join(" "); // collapse whitespace runs to single spaces
+            let tag = fields.next().ok_or_else(|| {
+                Error::Unexpected(format!("multiword entry has no tab-separated tag: {}", line))
+            })?; // a line without a tab used to panic on an out-of-bounds index; report it as an error instead
+            let pos = tagger.id_tag(tag).into_static();
+            multiwords.push((word, pos));
+        }
+
+        Ok((MultiwordTaggerFields { multiwords }).into())
+    }
+}
diff --git a/nlprule/src/tokenizer/multiword.rs b/nlprule/src/components/multiword_tagger/mod.rs
similarity index 72%
rename from nlprule/src/tokenizer/multiword.rs
rename to nlprule/src/components/multiword_tagger/mod.rs
index 9af2ca7..03518fe 100644
--- a/nlprule/src/tokenizer/multiword.rs
+++ b/nlprule/src/components/multiword_tagger/mod.rs
@@ -1,12 +1,19 @@
 //! Checks if the input text contains multi-token phrases from a finite list (might contain e. g. city names) and assigns lemmas and part-of-speech tags accordingly.
 
+use crate::properties::*;
 use crate::types::*;
 use aho_corasick::AhoCorasick;
+use lazy_static::lazy_static;
 use serde::{Deserialize, Serialize};
 
+use super::Component;
+
+#[cfg(feature = "compile")]
+mod compile;
+
 #[derive(Serialize, Deserialize)]
-pub(crate) struct MultiwordTaggerFields {
-    pub(crate) multiwords: Vec<(String, PosId<'static>)>,
+struct MultiwordTaggerFields {
+    multiwords: Vec<(String, PosId<'static>)>,
 }
 
 impl From<MultiwordTaggerFields> for MultiwordTagger {
@@ -36,9 +43,20 @@ pub struct MultiwordTagger {
     multiwords: Vec<(String, PosId<'static>)>,
 }
 
-impl MultiwordTagger {
-    /// Populates the `.multiword_data` field of the passed tokens by checking if any known phrases are contained.
-    pub fn apply<'t>(&'t self, sentence: &mut Sentence<'t>) {
+impl Transform for MultiwordTagger {
+    fn properties(&self) -> PropertiesMut {
+        lazy_static! {
+            static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]);
+        }
+        *PROPERTIES
+    }
+
+    fn transform<'t>(
+        &'t self,
+        mut sentence: Sentence<'t>,
+    ) -> Result<Sentence<'t>, crate::properties::Error> {
+        let props = self.property_guard(&mut sentence)?;
+
         let tagger = sentence.tagger();
 
         let mut start_indices = DefaultHashMap::new();
@@ -50,7 +68,7 @@ impl MultiwordTagger {
             .enumerate()
             .map(|(i, x)| {
                 start_indices.insert(byte_index, i);
-                byte_index += x.text().0.len();
+                byte_index += x.as_str().len();
                 end_indices.insert(byte_index, i);
                 byte_index += " ".len();
 
@@ -66,11 +84,19 @@ impl MultiwordTagger {
                 let (word, pos) = &self.multiwords[m.pattern()];
                 // end index is inclusive
                 for token in sentence.iter_mut().skip(*start).take((end + 1) - start) {
-                    token.tags_mut().push(
+                    props.tags_mut(token)?.push(
                         WordData::new(tagger.id_word(word.as_str().into()), pos.clone()).freeze(),
                     );
                 }
             }
         }
+
+        Ok(sentence)
+    }
+}
+
+impl Component for MultiwordTagger {
+    fn name() -> &'static str {
+        "multiword_tagger"
     }
 }
diff --git a/nlprule/src/components/rules/compile/mod.rs b/nlprule/src/components/rules/compile/mod.rs
new file mode 100644
index 0000000..e2d6cf2
--- /dev/null
+++ b/nlprule/src/components/rules/compile/mod.rs
@@ -0,0 +1,234 @@
+mod structure;
+
+use fs_err::File;
+use std::{io::BufReader, path::PathBuf};
+use log::warn;
+
+use crate::{
+    compile::{BuildComponent, BuildInfo, Error},
+    rule::id::Category,
+};
+
+use super::*;
+
+/// Options for a disambiguator.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub(crate) struct DisambiguatorLangOptions {
+    /// Whether to allow errors while constructing the disambiguator (logged as a warning instead of failing the build).
+    pub allow_errors: bool,
+    /// Disambiguation Rule selectors to use in this disambiguator. If empty, all rules are selected.
+    #[serde(default)]
+    pub ids: Vec<Selector>,
+    /// Disambiguation Rule selectors to ignore in this disambiguator.
+    #[serde(default)]
+    pub ignore_ids: Vec<Selector>,
+}
+
+#[derive(Deserialize)]
+pub struct DisambiguatorPaths { // input paths for building a `Disambiguator`
+    disambiguator_xml: PathBuf, // XML file with the disambiguation rules
+    disambiguator_options: PathBuf, // JSON file deserialized into `DisambiguatorLangOptions`
+}
+
+impl BuildComponent for Disambiguator {
+    type Paths = DisambiguatorPaths;
+
+    fn build(paths: DisambiguatorPaths, build_info: Option<&mut BuildInfo>) -> Result<Self, Error> { // parses the XML rules, assigns ids and filters them by the JSON options
+        let build_info = build_info.ok_or(Error::BuildInfoUnset)?; // build info is mandatory for this component
+
+        let options: DisambiguatorLangOptions =
+            serde_json::from_reader(BufReader::new(File::open(&paths.disambiguator_options)?))?;
+        let rules = structure::parse::read_disambiguation_rules(paths.disambiguator_xml);
+
+        let mut error = None; // only the first error encountered is recorded
+
+        let rules: Vec<_> = rules
+            .into_iter()
+            .filter_map(|x| match x {
+                Ok((rule_structure, group, _)) => {
+                    let id = Category::new("DISAMBIGUATION"); // every disambiguation rule id is rooted at this category
+
+                    let id = if let Some(group) = &group {
+                        id.join(group.id.as_str()).join(group.n)
+                    } else {
+                        id.join(
+                            rule_structure
+                                .id
+                                .as_ref()
+                                .expect("ID must be set if not in group."),
+                        )
+                        .join(0) // ungrouped rules always get index 0
+                    };
+
+                    match DisambiguationRule::from_rule_structure(rule_structure, build_info) {
+                        Ok(mut rule) => {
+                            if error.is_none() // once an error has been recorded, every later rule is dropped
+                                && (options.ids.is_empty() // an empty `ids` list selects all rules
+                                    || options.ids.iter().any(|x| x.is_match(&id)))
+                                && !options.ignore_ids.iter().any(|x| x.is_match(&id))
+                            {
+                                rule.id = id;
+
+                                Some(rule)
+                            } else {
+                                None
+                            }
+                        }
+                        Err(x) => {
+                            if error.is_none() {
+                                error = Some(format!("[Rule] {}", x)); // failure while building the rule from its structure
+                            }
+                            None
+                        }
+                    }
+                }
+                Err(x) => {
+                    if error.is_none() {
+                        error = Some(format!("[Structure] {}", x)); // failure while reading the rule structure from XML
+                    }
+                    None
+                }
+            })
+            .collect();
+
+        if let Some(x) = error {
+            if options.allow_errors {
+                warn!("Error constructing Disambiguator: {}", x) // tolerated: the rules collected before the error are kept
+            } else {
+                return Err(Error::Unexpected(format!(
+                    "Error constructing Disambiguator: {}",
+                    x
+                )));
+            }
+        }
+
+        Ok(Disambiguator {
+            rules,
+            properties: Default::default(),
+        })
+    }
+}
+
+/// Language-dependent options for a rule set.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub(crate) struct RulesLangOptions {
+    /// Whether to allow errors while constructing the rules.
+    pub allow_errors: bool, // NOTE(review): the `Rules` build in this file never reads this flag; errors are only logged — confirm intent
+    /// Grammar Rule selectors to use in this set. If empty, all rules are selected.
+    #[serde(default)]
+    pub ids: Vec<Selector>,
+    /// Grammar Rule selectors to ignore in this set.
+    #[serde(default)]
+    pub ignore_ids: Vec<Selector>,
+}
+
+#[derive(Deserialize)]
+pub struct RulesPaths { // input paths for building `Rules`
+    rules_xml: PathBuf, // XML file with the grammar rules
+    rules_options: PathBuf, // JSON file deserialized into `RulesLangOptions`
+}
+
+impl BuildComponent for Rules {
+    type Paths = RulesPaths;
+
+    fn build(paths: RulesPaths, build_info: Option<&mut BuildInfo>) -> Result<Self, Error> { // parses the XML grammar rules, filters them by the JSON options and fills in their metadata
+        let build_info = build_info.ok_or(Error::BuildInfoUnset)?; // build info is mandatory for this component
+
+        let options: RulesLangOptions =
+            serde_json::from_reader(BufReader::new(File::open(&paths.rules_options)?))?;
+        let rules = structure::parse::read_rules(paths.rules_xml);
+        let mut errors: DefaultHashMap<String, usize> = DefaultHashMap::new(); // error message -> number of occurrences
+
+        let rules: Vec<_> = rules
+            .into_iter()
+            .filter_map(|x| match x {
+                Ok((rule_structure, group, category)) => {
+                    let category = category.expect("grammar rules must have category");
+                    let id = Category::new(category.id.as_str());
+
+                    let id = if let Some(group) = &group {
+                        id.join(group.id.as_str()).join(group.n)
+                    } else {
+                        id.join(
+                            rule_structure
+                                .id
+                                .as_ref()
+                                .expect("ID must be set if not in group."),
+                        )
+                        .join(0) // ungrouped rules always get index 0
+                    };
+
+                    let rule_on = match rule_structure.default.as_deref() { // a missing `default` attribute counts as "on"
+                        Some("off") | Some("temp_off") => false,
+                        Some("on") | None => true,
+                        Some(x) => panic!("unknown `default` value: {}", x),
+                    };
+
+                    let group_on = match group.as_ref().and_then(|x| x.default.as_deref()) { // no group, or no `default` on it, counts as "on"
+                        Some("off") | Some("temp_off") => false,
+                        Some("on") | None => true,
+                        Some(x) => panic!("unknown `default` value: {}", x),
+                    };
+
+                    let category_on = match category.default.as_deref() {
+                        Some("off") | Some("temp_off") => false,
+                        Some("on") | None => true,
+                        Some(x) => panic!("unknown `default` value: {}", x),
+                    };
+
+                    let name = rule_structure.name.as_ref().map_or_else( // fall back to the group's name when the rule has none
+                        || {
+                            let group = group.as_ref().expect("must have group if name not set");
+                            group.name.clone()
+                        },
+                        |x| x.clone(),
+                    );
+
+                    match Rule::from_rule_structure(rule_structure, build_info) {
+                        Ok(mut rule) => {
+                            if (options.ids.is_empty() // an empty `ids` list selects all rules
+                                || options.ids.iter().any(|x| x.is_match(&id)))
+                                && !options.ignore_ids.iter().any(|x| x.is_match(&id))
+                            {
+                                rule.id = id;
+                                rule.name = name;
+                                rule.category_name = category.name;
+                                rule.category_type = category.kind;
+                                rule.enabled = category_on && group_on && rule_on; // enabled only if rule, group and category are all on
+                                Some(rule)
+                            } else {
+                                None
+                            }
+                        }
+                        Err(x) => {
+                            *errors.entry(format!("[Rule] {}", x)).or_insert(0) += 1;
+                            None
+                        }
+                    }
+                }
+                Err(x) => {
+                    *errors.entry(format!("[Structure] {}", x)).or_insert(0) += 1;
+                    None
+                }
+            })
+            .collect();
+
+        if !errors.is_empty() {
+            let mut errors: Vec<(String, usize)> = errors.into_iter().collect();
+            errors.sort_by_key(|x| std::cmp::Reverse(x.1)); // most frequent first; `Reverse` avoids truncating the `usize` count to `i32`
+
+            warn!(
+                "Errors constructing Rules: {:#?}",
+                &errors
+                    .iter()
+                    .map(|(message, number)| format!("{} (n={})", message, number))
+                    .collect::<Vec<_>>()
+            );
+        }
+
+        Ok(Rules { // errors are only logged above; the rules that were built successfully are returned
+            rules,
+            properties: Default::default(),
+        })
+    }
+}
diff --git a/nlprule/src/components/rules/compile/structure/impls.rs b/nlprule/src/components/rules/compile/structure/impls.rs
new file mode 100644
index 0000000..27920df
--- /dev/null
+++ b/nlprule/src/components/rules/compile/structure/impls.rs
@@ -0,0 +1,375 @@
+use std::{
+    collections::hash_map::DefaultHasher,
+    hash::{Hash, Hasher},
+};
+
+use crate::utils::parallelism::MaybeParallelIterator;
+use crate::{
+    compile::{BuildInfo, Error},
+    rule::engine::{composition::*, Engine},
+    utils::regex::Regex,
+};
+use crate::{rule::disambiguation::PosFilter, types::*};
+
+impl TextMatcher {
+    /// Wraps `matcher` together with an optional precomputed set of matching word ids.
+    ///
+    /// For regex matchers, every word of the tagger's word store is tested once and the result
+    /// is memoized in the regex cache of `info`, keyed by a hash of the regex and the `negate`
+    /// and `empty_always_false` flags. All other matchers get `set: None`.
+    pub fn new(matcher: Matcher, info: &mut BuildInfo) -> Result<Self, Error> {
+        let set = match &matcher.matcher {
+            // plain strings and graph references (which can not be cached) get no set
+            either::Left(_) => None,
+            either::Right(regex) => {
+                let mut state = DefaultHasher::default();
+                regex.hash(&mut state);
+                matcher.negate.hash(&mut state);
+                matcher.empty_always_false.hash(&mut state);
+                let key = state.finish();
+
+                match info.mut_regex_cache().get(&key) {
+                    Some(hit) => hit.clone(),
+                    None => {
+                        let words: Vec<_> = info.tagger().word_store().iter().collect();
+                        let ids: DefaultHashSet<_> = words
+                            .into_maybe_par_iter()
+                            .filter_map(|(word, id)| {
+                                Some(*id).filter(|_| matcher.is_match(word.as_str(), None, None))
+                            })
+                            .collect();
+
+                        // some regexes match lots of strings; this cutoff is pretty arbitrary but
+                        // without any threshold the size of some sets blows up (from manual
+                        // inspection the vast majority of regexes matches less than 100 strings)
+                        let fresh = if ids.len() > 100 { None } else { Some(ids) };
+                        info.mut_regex_cache().insert(key, fresh.clone());
+                        fresh
+                    }
+                }
+            }
+        };
+
+        Ok(TextMatcher { matcher, set })
+    }
+}
+
+impl PosMatcher {
+    /// Builds a mask over the tagger's tag ids marking every tag that `matcher` accepts.
+    pub fn new(matcher: Matcher, info: &mut BuildInfo) -> Result<Self, Error> {
+        let mut mask = vec![false; info.tagger().tag_store().len()];
+        for (tag, tag_id) in info.tagger().tag_store().iter() {
+            let accepted = matcher.is_match(tag.as_str(), None, None);
+            mask[tag_id.value() as usize] = accepted;
+        }
+        Ok(PosMatcher { mask })
+    }
+}
+
+impl PosFilter {
+    pub fn new(matcher: PosMatcher) -> Self {
+        Self { matcher }
+    }
+}
+
+impl Regex {
+    /// Translates `java_regex_str` with `compile::utils::from_java_regex` and eagerly compiles
+    /// the result, so that an invalid pattern is reported as `Error::Regex` right away.
+    pub fn from_java_regex(
+        java_regex_str: &str,
+        full_match: bool,
+        case_sensitive: bool,
+    ) -> Result<Self, Error> {
+        // note the argument order: the helper is called with `case_sensitive` before `full_match`
+        let translated =
+            crate::compile::utils::from_java_regex(java_regex_str, case_sensitive, full_match)?;
+        let regex = Regex::new(translated);
+
+        regex.try_compile().map_err(Error::Regex)?;
+        Ok(regex)
+    }
+}
+
+impl Engine {
+    /// Converts the raw index `id` into a `GraphId` that is known to this engine.
+    ///
+    /// An id that is not part of the graph is replaced by the largest id of the graph.
+    /// Returns `Error::Unexpected` if the graph does not contain any ids.
+    pub fn to_graph_id(&self, id: usize) -> Result<GraphId, Error> {
+        let map = match self {
+            Engine::Token(engine) => &engine.composition.id_to_idx,
+            Engine::Text(_, id_to_idx) => id_to_idx,
+        };
+        let largest = *map
+            .keys()
+            .max()
+            .ok_or_else(|| Error::Unexpected("graph is empty".into()))?;
+
+        match GraphId(id) {
+            known if map.contains_key(&known) => Ok(known),
+            // ideally this should throw an error but LT is more lenient than nlprule
+            _ => Ok(largest),
+        }
+    }
+}
+
+mod composition {
+    use super::*;
+    use crate::{
+        rule::engine::composition::{
+            AndAtom, Atom, Composition, FalseAtom, GraphId, NotAtom, OffsetAtom, OrAtom, Part,
+            Quantifier, TrueAtom,
+        },
+        utils::regex::Regex,
+    };
+
+    impl Atom {
+        /// Iterates mutably over the atoms inside `self`: an atom without children yields
+        /// itself, `NotAtom` and `OffsetAtom` delegate to the atom they wrap.
+        ///
+        /// NOTE(review): `AndAtom` and `OrAtom` yield their direct children without recursing,
+        /// so atoms nested below those children are not visited - confirm this is intended.
+        fn iter_mut<'a>(&'a mut self) -> Box<dyn Iterator<Item = &'a mut Atom> + 'a> {
+            match self {
+                Atom::NotAtom(wrapper) => wrapper.atom.iter_mut(),
+                Atom::OffsetAtom(wrapper) => wrapper.atom.iter_mut(),
+                Atom::AndAtom(group) => Box::new(group.atoms.iter_mut()),
+                Atom::OrAtom(group) => Box::new(group.atoms.iter_mut()),
+                // atoms without children
+                Atom::ChunkAtom(_)
+                | Atom::SpaceBeforeAtom(_)
+                | Atom::TextAtom(_)
+                | Atom::WordDataAtom(_)
+                | Atom::FalseAtom(_)
+                | Atom::TrueAtom(_) => Box::new(std::iter::once(self)),
+            }
+        }
+
+        /// Collects mutable references to the graph ids of the atoms yielded by `iter_mut`:
+        /// the id of the matcher of a `ChunkAtom`, of the text matcher of a `TextAtom` and of
+        /// the inflect matcher (if there is one) of a `WordDataAtom`.
+        pub fn mut_graph_ids(&mut self) -> Vec<&mut GraphId> {
+            self.iter_mut()
+                .filter_map(|atom| match atom {
+                    Atom::ChunkAtom(chunk) => chunk.matcher.mut_graph_id(),
+                    Atom::TextAtom(text) => text.matcher.matcher.mut_graph_id(),
+                    Atom::WordDataAtom(word_data) => word_data
+                        .matcher
+                        .inflect_matcher
+                        .as_mut()
+                        .and_then(|x| x.matcher.mut_graph_id()),
+                    // the remaining kinds of atoms do not contribute a graph id
+                    _ => None,
+                })
+                .collect()
+        }
+    }
+
+    impl Matcher {
+        pub fn new_regex(regex: Regex, negate: bool, empty_always_false: bool) -> Self {
+            Self {
+                matcher: either::Right(regex),
+                negate,
+                case_sensitive: true, // handled by regex, should maybe be an option
+                empty_always_false,
+            }
+        }
+
+        pub fn new_string(
+            string_or_idx: either::Either<String, GraphId>,
+            negate: bool,
+            case_sensitive: bool,
+            empty_always_false: bool,
+        ) -> Self {
+            Self {
+                matcher: either::Left(string_or_idx),
+                negate,
+                case_sensitive,
+                empty_always_false,
+            }
+        }
+
+        /// Returns a copy of the graph id held by this matcher, `None` for strings and regexes.
+        pub fn graph_id(&self) -> Option<GraphId> {
+            match &self.matcher {
+                either::Left(either::Right(id)) => Some(*id),
+                _ => None,
+            }
+        }
+
+        /// Like `graph_id` but hands out the id by mutable reference so it can be changed.
+        pub fn mut_graph_id(&mut self) -> Option<&mut GraphId> {
+            match &mut self.matcher {
+                either::Left(either::Right(id)) => Some(id),
+                _ => None,
+            }
+        }
+    }
+
+    impl Quantifier {
+        pub fn new(min: usize, max: usize) -> Self {
+            assert!(max >= min); // panics on an inverted range, i.e. if `max < min`
+            Quantifier { min, max }
+        }
+    }
+
+    impl AndAtom {
+        /// Combines `atoms` into one atom that is as simple as possible.
+        ///
+        /// `TrueAtom`s are dropped first. Then a `TrueAtom` is returned if no atom is left, the
+        /// remaining atom itself if exactly one is left and an `AndAtom` of the rest otherwise.
+        pub fn and(atoms: Vec<Atom>) -> Atom {
+            let mut atoms = atoms;
+            atoms.retain(|x| !matches!(x, Atom::TrueAtom { .. }));
+
+            match atoms.len() {
+                0 => (TrueAtom {}).into(),
+                1 => atoms.remove(0),
+                _ => (AndAtom { atoms }).into(),
+            }
+        }
+    }
+
+    impl OrAtom {
+        /// Combines `atoms` into one atom that is as simple as possible.
+        ///
+        /// `FalseAtom`s are dropped first. Then a `FalseAtom` is returned if no atom is left, the
+        /// remaining atom itself if exactly one is left and an `OrAtom` of the rest otherwise.
+        pub fn or(atoms: Vec<Atom>) -> Atom {
+            let mut atoms = atoms;
+            atoms.retain(|x| !matches!(x, Atom::FalseAtom { .. }));
+
+            match atoms.len() {
+                0 => (FalseAtom {}).into(),
+                1 => atoms.remove(0),
+                _ => (OrAtom { atoms }).into(),
+            }
+        }
+    }
+
+    impl NotAtom {
+        pub fn not(atom: Atom) -> Atom {
+            match atom {
+                Atom::TrueAtom { .. } => FalseAtom::default().into(), // constants are flipped
+                Atom::FalseAtom { .. } => TrueAtom::default().into(), // instead of being wrapped
+                x => (NotAtom { atom: Box::new(x) }).into(), // any other atom is boxed and wrapped
+            }
+        }
+    }
+
+    impl OffsetAtom {
+        /// Wraps `atom` (boxed) in an `OffsetAtom` carrying the given `offset`.
+        /// NOTE(review): presumably `offset` is relative to the current token - confirm.
+        pub fn new(atom: Atom, offset: isize) -> Self {
+            let atom = Box::new(atom);
+            Self { atom, offset }
+        }
+    }
+
+    impl Composition {
+        /// Builds a composition from `parts` together with the map from graph ids to indices.
+        ///
+        /// Graph id 0 maps to 0 and the n-th visible part (counting from one) gets graph id `n`
+        /// which maps to its position in `parts` plus one. Afterwards, each graph id used by the
+        /// atom of a part is decremented until it maps to at most the position of that part.
+        pub fn new(mut parts: Vec<Part>) -> Result<Self, Error> {
+            let mut id_to_idx = DefaultHashMap::default();
+            id_to_idx.insert(GraphId(0), 0);
+            let visible = parts.iter().enumerate().filter(|(_, part)| part.visible);
+            for (n, (i, _)) in visible.enumerate() {
+                id_to_idx.insert(GraphId(n + 1), i + 1);
+            }
+
+            // entry `i` is set if `quantifier.min` is zero for all parts from `i` onwards
+            let can_stop_mask = (0..parts.len())
+                .map(|i| parts[i..].iter().all(|x| x.quantifier.min == 0))
+                .collect();
+
+            for (i, part) in parts.iter_mut().enumerate() {
+                for id in part.atom.mut_graph_ids() {
+                    loop {
+                        let index = *id_to_idx.get(&id).ok_or_else(|| {
+                            Error::Unexpected(format!("id must exist in graph: {:?}", id))
+                        })?;
+                        if index <= i {
+                            break;
+                        }
+                        // ideally this should throw an error but LT is more lenient than nlprule
+                        *id = GraphId(id.0 - 1);
+                    }
+                }
+            }
+
+            Ok(Composition {
+                parts,
+                id_to_idx,
+                can_stop_mask,
+            })
+        }
+    }
+}
+
+pub mod filters {
+    use super::Error;
+    use std::collections::HashMap;
+
+    use crate::{filter::*, rule::engine::Engine, utils::regex::Regex};
+
+    /// Construction of a filter from a map of named string arguments.
+    trait FromArgs: Sized {
+        /// Builds `Self` from the named string arguments in `args` and the given `engine`.
+        fn from_args(args: HashMap<String, String>, engine: &Engine) -> Result<Self, Error>;
+    }
+
+    impl FromArgs for NoDisambiguationEnglishPartialPosTagFilter {
+        /// Requires the arguments `no` (parsed as `usize` and converted to a graph id of
+        /// `engine`), `regexp` and `postag_regexp` (both converted from Java regexes).
+        ///
+        /// # Errors
+        ///
+        /// Returns `Error::Unimplemented` if `negate_postag` is given, `Error::Unexpected` if a
+        /// required argument is missing and the error of any conversion that fails.
+        fn from_args(args: HashMap<String, String>, engine: &Engine) -> Result<Self, Error> {
+            // report the unsupported argument as an error of this filter instead of panicking
+            if args.contains_key("negate_postag") {
+                return Err(Error::Unimplemented(
+                    "negate_postag not supported in NoDisambiguationEnglishPartialPosTagFilter"
+                        .into(),
+                ));
+            }
+
+            let required = |key: &str| {
+                args.get(key).ok_or_else(|| {
+                    Error::Unexpected(format!(
+                        "NoDisambiguationEnglishPartialPosTagFilter must have `{}` argument",
+                        key
+                    ))
+                })
+            };
+
+            Ok(NoDisambiguationEnglishPartialPosTagFilter {
+                id: engine.to_graph_id(required("no")?.parse::<usize>()?)?,
+                regexp: Regex::from_java_regex(required("regexp")?, true, true)?,
+                postag_regexp: Regex::from_java_regex(required("postag_regexp")?, true, true)?,
+                // `negate_postag` is rejected above, so it can only be unset at this point
+                negate_postag: false,
+            })
+        }
+    }
+
+    /// Creates the filter called `name` from `args`.
+    ///
+    /// Returns `Error::Unexpected` for unsupported names and forwards errors of the filter.
+    pub fn get_filter(
+        name: &str,
+        args: HashMap<String, String>,
+        engine: &Engine,
+    ) -> Result<Filter, Error> {
+        match name {
+            "NoDisambiguationEnglishPartialPosTagFilter" => {
+                Ok(NoDisambiguationEnglishPartialPosTagFilter::from_args(args, engine)?.into())
+            }
+            _ => Err(Error::Unexpected(format!("unsupported filter {}", name))),
+        }
+    }
+}
diff --git a/nlprule/src/compile/structure.rs b/nlprule/src/components/rules/compile/structure/mod.rs
similarity index 79%
rename from nlprule/src/compile/structure.rs
rename to nlprule/src/components/rules/compile/structure/mod.rs
index eb38b43..883b9fc 100644
--- a/nlprule/src/compile/structure.rs
+++ b/nlprule/src/components/rules/compile/structure/mod.rs
@@ -1,7 +1,7 @@
-use fs_err::File;
 use serde::Deserialize;
-use std::io::BufReader;
-use xml::reader::EventReader;
+
+pub mod impls;
+pub mod parse;
 
 mod preprocess {
     use std::{borrow::Cow, str::FromStr};
@@ -639,154 +639,3 @@ pub enum DisambiguationRuleContainer {
     RuleGroup(DisambiguationRuleGroup),
     Unification(Unification),
 }
-
-macro_rules! flatten_group {
-    ($rulegroup:expr, $category:expr) => {{
-        let group_antipatterns = if let Some(antipatterns) = $rulegroup.antipatterns {
-            antipatterns
-        } else {
-            Vec::new()
-        };
-
-        let group = Group {
-            id: $rulegroup.id,
-            default: $rulegroup.default,
-            name: $rulegroup.name,
-            n: 0,
-        };
-
-        $rulegroup
-            .rules
-            .into_iter()
-            .enumerate()
-            .map(|(i, mut rule)| {
-                if let Some(antipatterns) = &mut rule.antipatterns {
-                    antipatterns.extend(group_antipatterns.clone());
-                } else {
-                    rule.antipatterns = Some(group_antipatterns.clone());
-                }
-
-                let mut group = group.clone();
-                group.n = i;
-                (rule, Some(group), $category.clone())
-            })
-            .collect::<Vec<_>>()
-    }};
-}
-
-type GrammarRuleReading = (Rule, Option<Group>, Option<Category>);
-type DisambiguationRuleReading = (DisambiguationRule, Option<Group>, Option<Category>);
-
-pub fn read_rules<P: AsRef<std::path::Path>>(
-    path: P,
-) -> Vec<Result<GrammarRuleReading, serde_xml_rs::Error>> {
-    let file = File::open(path.as_ref()).unwrap();
-    let file = BufReader::new(file);
-
-    let sanitized = preprocess::sanitize(file, &["suggestion"]);
-    let rules = preprocess::extract_rules(sanitized.as_bytes());
-
-    let mut unifications = Vec::new();
-
-    let rules: Vec<_> = rules
-        .into_iter()
-        .map(|(xml, category)| {
-            let mut out = Vec::new();
-
-            let deseralized = RuleContainer::deserialize(&mut serde_xml_rs::Deserializer::new(
-                EventReader::new(xml.as_bytes()),
-            ));
-
-            out.extend(match deseralized {
-                Ok(rule_container) => match rule_container {
-                    RuleContainer::Rule(rule) => {
-                        vec![Ok((rule, None, category))]
-                    }
-                    RuleContainer::RuleGroup(rule_group) => flatten_group!(rule_group, category)
-                        .into_iter()
-                        .map(Ok)
-                        .collect(),
-                    RuleContainer::Unification(unification) => {
-                        unifications.push(unification);
-
-                        vec![]
-                    }
-                },
-                Err(err) => vec![Err(err)],
-            });
-            out
-        })
-        .flatten()
-        .collect();
-
-    rules
-        .into_iter()
-        .map(|result| match result {
-            Ok(mut x) => {
-                x.0.unifications = Some(unifications.clone());
-
-                Ok(x)
-            }
-            Err(x) => Err(x),
-        })
-        .collect()
-}
-
-pub fn read_disambiguation_rules<P: AsRef<std::path::Path>>(
-    path: P,
-) -> Vec<Result<DisambiguationRuleReading, serde_xml_rs::Error>> {
-    let file = File::open(path.as_ref()).unwrap();
-    let file = BufReader::new(file);
-
-    let sanitized = preprocess::sanitize(file, &[]);
-    let rules = preprocess::extract_rules(sanitized.as_bytes());
-
-    let mut unifications = Vec::new();
-
-    let rules: Vec<_> = rules
-        .into_iter()
-        .map(|(xml, _)| {
-            let mut out = Vec::new();
-
-            let deseralized = DisambiguationRuleContainer::deserialize(
-                &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())),
-            );
-
-            let category: Option<Category> = None;
-
-            out.extend(match deseralized {
-                Ok(rule_container) => match rule_container {
-                    DisambiguationRuleContainer::Rule(rule) => {
-                        vec![Ok((rule, None, category))]
-                    }
-                    DisambiguationRuleContainer::RuleGroup(rule_group) => {
-                        flatten_group!(rule_group, category)
-                            .into_iter()
-                            .map(Ok)
-                            .collect()
-                    }
-                    DisambiguationRuleContainer::Unification(unification) => {
-                        unifications.push(unification);
-
-                        vec![]
-                    }
-                },
-                Err(err) => vec![Err(err)],
-            });
-            out
-        })
-        .flatten()
-        .collect();
-
-    rules
-        .into_iter()
-        .map(|result| match result {
-            Ok(mut x) => {
-                x.0.unifications = Some(unifications.clone());
-
-                Ok(x)
-            }
-            Err(x) => Err(x),
-        })
-        .collect()
-}
diff --git a/nlprule/src/compile/parse_structure.rs b/nlprule/src/components/rules/compile/structure/parse.rs
similarity index 72%
rename from nlprule/src/compile/parse_structure.rs
rename to nlprule/src/components/rules/compile/structure/parse.rs
index 0be9924..f6449a6 100644
--- a/nlprule/src/compile/parse_structure.rs
+++ b/nlprule/src/components/rules/compile/structure/parse.rs
@@ -1,12 +1,12 @@
-use std::{ops::Range, sync::Arc};
+use std::{io::BufReader, ops::Range};
 
-use super::{structure, Error};
-use crate::{tokenizer::tag::Tagger, types::*};
+use crate::compile::{BuildInfo, Error};
+use crate::types::*;
 use crate::{utils, utils::regex::Regex};
+use fs_err::File;
 use lazy_static::lazy_static;
-use serde::{Deserialize, Serialize};
-
-pub use structure::{read_disambiguation_rules, read_rules};
+use serde::Deserialize;
+use serde_xml_rs::EventReader;
 
 use crate::rule::disambiguation::*;
 use crate::rule::engine::composition::concrete::*;
@@ -15,64 +15,16 @@ use crate::rule::engine::*;
 use crate::rule::grammar::*;
 use crate::rule::{id::Index, DisambiguationRule, Rule, Unification};
 
+use super::Category;
+
 // this is set arbitrarily at the moment, could be an option
 #[inline]
 fn max_matches() -> usize {
     20
 }
 
-#[derive(Serialize, Deserialize, Debug)]
-pub(crate) struct RegexCache {
-    cache: DefaultHashMap<u64, Option<DefaultHashSet<WordIdInt>>>,
-    // this is compared with the hash of the word store of the tagger
-    word_hash: u64,
-}
-
-impl RegexCache {
-    pub fn new(word_hash: u64) -> Self {
-        RegexCache {
-            cache: DefaultHashMap::default(),
-            word_hash,
-        }
-    }
-
-    pub fn word_hash(&self) -> &u64 {
-        &self.word_hash
-    }
-
-    pub(crate) fn get(&self, key: &u64) -> Option<&Option<DefaultHashSet<WordIdInt>>> {
-        self.cache.get(key)
-    }
-
-    pub(crate) fn insert(&mut self, key: u64, value: Option<DefaultHashSet<WordIdInt>>) {
-        self.cache.insert(key, value);
-    }
-}
-
-pub(crate) struct BuildInfo {
-    tagger: Arc<Tagger>,
-    regex_cache: RegexCache,
-}
-
-impl BuildInfo {
-    pub fn new(tagger: Arc<Tagger>, regex_cache: RegexCache) -> Self {
-        BuildInfo {
-            tagger,
-            regex_cache,
-        }
-    }
-
-    pub fn tagger(&self) -> &Arc<Tagger> {
-        &self.tagger
-    }
-
-    pub fn mut_regex_cache(&mut self) -> &mut RegexCache {
-        &mut self.regex_cache
-    }
-}
-
 fn parse_match_attribs(
-    attribs: impl structure::MatchAttributes,
+    attribs: impl super::MatchAttributes,
     text: Option<&str>,
     case_sensitive: bool,
     text_match_idx: Option<usize>,
@@ -149,11 +101,11 @@ fn parse_match_attribs(
         };
 
         if inflected {
-            inflect_matcher = Some(matcher);
+            inflect_matcher = Some(TextMatcher::new(matcher, info)?);
         } else {
             atoms.push(
                 (TextAtom {
-                    matcher: TextMatcher::new(matcher, info),
+                    matcher: TextMatcher::new(matcher, info)?,
                 })
                 .into(),
             );
@@ -172,13 +124,13 @@ fn parse_match_attribs(
                 true,
             )
         };
-        pos_matcher = Some(PosMatcher::new(raw_matcher, info));
+        pos_matcher = Some(PosMatcher::new(raw_matcher, info)?);
     }
 
     if pos_matcher.is_some() || inflect_matcher.is_some() {
         let matcher = WordDataMatcher {
             pos_matcher,
-            inflect_matcher: inflect_matcher.map(|x| TextMatcher::new(x, info)),
+            inflect_matcher,
         };
         atoms.push(
             (WordDataAtom {
@@ -234,7 +186,7 @@ fn parse_match_attribs(
 }
 
 fn get_exceptions(
-    token: &structure::Token,
+    token: &super::Token,
     case_sensitive: bool,
     only_shifted: bool,
     info: &mut BuildInfo,
@@ -243,15 +195,11 @@ fn get_exceptions(
         let exceptions: Vec<Atom> = parts
             .iter()
             .filter_map(|x| match x {
-                structure::TokenPart::Exception(x) => Some(x),
+                super::TokenPart::Exception(x) => Some(x),
                 _ => None,
             })
             .filter_map(|x| {
-                let exception_text = if let Some(exception_text) = &x.text {
-                    Some(exception_text.as_str())
-                } else {
-                    None
-                };
+                let exception_text = x.text.as_ref().map(|x| x.as_str());
                 let mut atom =
                     match parse_match_attribs(x, exception_text, case_sensitive, None, info) {
                         Ok(atom) => atom,
@@ -287,14 +235,14 @@ fn get_exceptions(
 }
 
 fn parse_token(
-    token: &structure::Token,
+    token: &super::Token,
     case_sensitive: bool,
     info: &mut BuildInfo,
 ) -> Result<Vec<Part>, Error> {
     let mut parts = Vec::new();
     let text = if let Some(parts) = &token.parts {
         parts.iter().find_map(|x| match x {
-            structure::TokenPart::Text(text) => Some(text.as_str()),
+            super::TokenPart::Text(text) => Some(text.as_str()),
             _ => None,
         })
     } else {
@@ -303,7 +251,7 @@ fn parse_token(
 
     let text_match_idx = if let Some(parts) = &token.parts {
         match parts.iter().find_map(|x| match x {
-            structure::TokenPart::Sub(sub) => Some(sub.no.parse::<usize>().map(|x| x + 1)),
+            super::TokenPart::Sub(sub) => Some(sub.no.parse::<usize>().map(|x| x + 1)),
             _ => None,
         }) {
             None => None,
@@ -374,7 +322,7 @@ fn parse_token(
     Ok(parts)
 }
 
-fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Result<Match, Error> {
+fn parse_match(m: super::Match, engine: &Engine, info: &mut BuildInfo) -> Result<Match, Error> {
     if m.postag.is_some()
         || m.postag_regex.is_some()
         || m.postag_replace.is_some()
@@ -396,11 +344,7 @@ fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Re
         m.no.parse::<usize>()
             .expect("no must be parsable as usize.");
 
-    let case_conversion = if let Some(conversion) = &m.case_conversion {
-        Some(conversion.as_str())
-    } else {
-        None
-    };
+    let case_conversion = m.case_conversion.as_deref();
 
     let pos_replacer = if let Some(postag) = m.postag {
         if postag.contains("+DT") || postag.contains("+INDT") {
@@ -418,7 +362,7 @@ fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Re
             x => panic!("unknown postag_regex value {:?}", x),
         };
         Some(PosReplacer {
-            matcher: PosMatcher::new(matcher, info),
+            matcher: PosMatcher::new(matcher, info)?,
         })
     } else {
         None
@@ -495,17 +439,17 @@ fn parse_synthesizer_text(text: &str, engine: &Engine) -> Result<Vec<Synthesizer
 }
 
 fn parse_suggestion(
-    data: structure::Suggestion,
+    data: super::Suggestion,
     engine: &Engine,
     info: &mut BuildInfo,
 ) -> Result<Synthesizer, Error> {
     let mut parts = Vec::new();
     for part in data.parts {
         match part {
-            structure::SuggestionPart::Text(text) => {
+            super::SuggestionPart::Text(text) => {
                 parts.extend(parse_synthesizer_text(text.as_str(), engine)?);
             }
-            structure::SuggestionPart::Match(m) => {
+            super::SuggestionPart::Match(m) => {
                 parts.push(SynthesizerPart::Match(parse_match(m, engine, info)?.into()));
             }
         }
@@ -523,7 +467,7 @@ fn get_last_id(parts: &[Part]) -> isize {
 }
 
 fn parse_parallel_tokens(
-    tokens: &[structure::Token],
+    tokens: &[super::Token],
     case_sensitive: bool,
     info: &mut BuildInfo,
 ) -> Result<Vec<Atom>, Error> {
@@ -544,7 +488,7 @@ fn parse_parallel_tokens(
 }
 
 fn parse_tokens(
-    tokens: &[structure::TokenCombination],
+    tokens: &[super::TokenCombination],
     case_sensitive: bool,
     info: &mut BuildInfo,
 ) -> Result<Vec<Part>, Error> {
@@ -552,8 +496,8 @@ fn parse_tokens(
 
     for token_combination in tokens {
         out.extend(match token_combination {
-            structure::TokenCombination::Token(token) => parse_token(token, case_sensitive, info)?,
-            structure::TokenCombination::And(tokens) => {
+            super::TokenCombination::Token(token) => parse_token(token, case_sensitive, info)?,
+            super::TokenCombination::And(tokens) => {
                 let atom =
                     AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
                 vec![Part {
@@ -564,7 +508,7 @@ fn parse_tokens(
                     unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
                 }]
             }
-            structure::TokenCombination::Or(tokens) => {
+            super::TokenCombination::Or(tokens) => {
                 let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
                 vec![Part {
                     atom,
@@ -574,7 +518,7 @@ fn parse_tokens(
                     unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
                 }]
             }
-            structure::TokenCombination::Feature(_) => Vec::new(),
+            super::TokenCombination::Feature(_) => Vec::new(),
         });
     }
 
@@ -582,7 +526,7 @@ fn parse_tokens(
 }
 
 fn parse_pattern(
-    pattern: structure::Pattern,
+    pattern: super::Pattern,
     info: &mut BuildInfo,
 ) -> Result<(Composition, usize, usize), Error> {
     let mut start = None;
@@ -596,17 +540,17 @@ fn parse_pattern(
 
     for part in &pattern.parts {
         match part {
-            structure::PatternPart::Token(token) => {
+            super::PatternPart::Token(token) => {
                 composition_parts.extend(parse_token(token, case_sensitive, info)?)
             }
-            structure::PatternPart::Marker(marker) => {
+            super::PatternPart::Marker(marker) => {
                 start = Some(get_last_id(&composition_parts));
 
                 composition_parts.extend(parse_tokens(&marker.tokens, case_sensitive, info)?);
 
                 end = Some(get_last_id(&composition_parts));
             }
-            structure::PatternPart::And(tokens) => {
+            super::PatternPart::And(tokens) => {
                 let atom =
                     AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
 
@@ -618,7 +562,7 @@ fn parse_pattern(
                     unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
                 });
             }
-            structure::PatternPart::Or(tokens) => {
+            super::PatternPart::Or(tokens) => {
                 let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?);
 
                 composition_parts.push(Part {
@@ -629,7 +573,7 @@ fn parse_pattern(
                     unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"),
                 });
             }
-            structure::PatternPart::Feature(_) => {}
+            super::PatternPart::Feature(_) => {}
         }
     }
 
@@ -642,12 +586,12 @@ fn parse_pattern(
 }
 
 fn parse_features(
-    pattern: &structure::Pattern,
-    unifications: &Option<Vec<structure::Unification>>,
+    pattern: &super::Pattern,
+    unifications: &Option<Vec<super::Unification>>,
     info: &mut BuildInfo,
-) -> Vec<Vec<PosFilter>> {
+) -> Result<Vec<Vec<PosFilter>>, Error> {
     let mut filters = Vec::new();
-    let mut parse_feature = |id: &str| -> Vec<PosFilter> {
+    let mut parse_feature = |id: &str| -> Result<Vec<PosFilter>, Error> {
         let unification = unifications
             .as_ref()
             .unwrap()
@@ -670,11 +614,11 @@ fn parse_features(
 
     for part in &pattern.parts {
         match part {
-            structure::PatternPart::Feature(feature) => filters.push(parse_feature(&feature.id)),
-            structure::PatternPart::Marker(marker) => {
+            super::PatternPart::Feature(feature) => filters.push(parse_feature(&feature.id)?),
+            super::PatternPart::Marker(marker) => {
                 for token_combination in &marker.tokens {
-                    if let structure::TokenCombination::Feature(feature) = token_combination {
-                        filters.push(parse_feature(&feature.id));
+                    if let super::TokenCombination::Feature(feature) = token_combination {
+                        filters.push(parse_feature(&feature.id)?);
                     }
                 }
             }
@@ -682,14 +626,11 @@ fn parse_features(
         }
     }
 
-    filters
+    Ok(filters)
 }
 
 impl Rule {
-    pub(crate) fn from_rule_structure(
-        data: structure::Rule,
-        info: &mut BuildInfo,
-    ) -> Result<Rule, Error> {
+    pub fn from_rule_structure(data: super::Rule, info: &mut BuildInfo) -> Result<Rule, Error> {
         if data.filter.is_some() {
             return Err(Error::Unimplemented(
                 "rules with filter are not implemented.".into(),
@@ -756,7 +697,7 @@ impl Rule {
         };
 
         let unify_data = if let Some(pattern) = &data.pattern {
-            let unify_filters = parse_features(&pattern, &data.unifications, info);
+            let unify_filters = parse_features(&pattern, &data.unifications, info)?;
             let unify_mask: Vec<_> = maybe_composition
                 .unwrap()
                 .parts
@@ -773,16 +714,16 @@ impl Rule {
 
         for part in data.message.parts {
             match part {
-                structure::MessagePart::Suggestion(suggestion) => {
+                super::MessagePart::Suggestion(suggestion) => {
                     let suggester = parse_suggestion(suggestion.clone(), &engine, info)?;
                     // simpler to just parse a second time than cloning the result
                     message_parts.extend(parse_suggestion(suggestion, &engine, info)?.parts);
                     suggesters.push(suggester);
                 }
-                structure::MessagePart::Text(text) => {
+                super::MessagePart::Text(text) => {
                     message_parts.extend(parse_synthesizer_text(text.as_str(), &engine)?);
                 }
-                structure::MessagePart::Match(m) => {
+                super::MessagePart::Match(m) => {
                     message_parts.push(SynthesizerPart::Match(
                         parse_match(m, &engine, info)?.into(),
                     ));
@@ -817,10 +758,10 @@ impl Rule {
 
             for part in &example.parts {
                 match part {
-                    structure::ExamplePart::Text(text) => {
+                    super::ExamplePart::Text(text) => {
                         texts.push(text.as_str());
                     }
-                    structure::ExamplePart::Marker(marker) => {
+                    super::ExamplePart::Marker(marker) => {
                         let (bytes_before, chars_before) =
                             texts.iter().fold((0, 0), |acc, text| {
                                 (acc.0 + text.len(), acc.1 + text.chars().count())
@@ -911,6 +852,8 @@ fn parse_tag_form(
     is_sentence_end: bool,
     info: &mut BuildInfo,
 ) -> Result<Tags<'static>, Error> {
+    let tagger = info.tagger();
+
     lazy_static! {
         static ref REGEX: Regex = Regex::new(r"(.+?)\[(.+?)\]".into());
     }
@@ -922,7 +865,7 @@ fn parse_tag_form(
     let text = captures.get(1).expect("1st regex group exists").as_str();
     let tags = captures.get(2).expect("2nd regex group exists").as_str();
 
-    let mut tag_vec: Vec<_> = tags
+    let mut tags: Vec<_> = tags
         .split(',')
         .filter_map(|x| {
             if x == "</S>" {
@@ -935,44 +878,46 @@ fn parse_tag_form(
                 None
             } else {
                 Some(WordData::new(
-                    info.tagger.id_word(parts[0].to_owned().into()),
-                    info.tagger.id_tag(parts[1]).into_static(),
+                    tagger.id_word(parts[0].to_owned().into()),
+                    tagger.id_tag(parts[1]).into_static(),
                 ))
             }
         })
         .collect();
 
-    tag_vec.push(
+    tags.push(
         WordData::new(
-            info.tagger.id_word(text.to_owned().into()),
+            tagger.id_word(text.to_owned().into()),
             PosId::special(SpecialPos::None),
         )
         .freeze(),
     );
 
     if is_sentence_end {
-        tag_vec.push(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze());
+        tags.push(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze());
     }
 
-    let tags = Tags::new(tag_vec);
-
-    Ok(tags)
+    Ok(Tags::new(WordId::empty(), tags))
 }
 
 impl WordData<'static> {
-    fn from_structure(data: structure::WordData, info: &mut BuildInfo) -> Self {
-        WordData::new(
-            info.tagger
+    fn from_structure(data: super::WordData, info: &mut BuildInfo) -> Result<Self, Error> {
+        Ok(WordData::new(
+            info.tagger()
                 .id_word(data.lemma.unwrap_or_else(String::new).into()),
-            info.tagger
+            info.tagger()
                 .id_tag(data.pos.as_deref().unwrap_or("").trim())
                 .into_static(),
-        )
+        ))
     }
 }
 
-fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> PosFilter {
-    match postag_regexp.as_deref() {
+fn parse_pos_filter(
+    postag: &str,
+    postag_regexp: Option<&str>,
+    info: &mut BuildInfo,
+) -> Result<PosFilter, Error> {
+    Ok(match postag_regexp.as_deref() {
         Some("yes") => PosFilter::new(PosMatcher::new(
             Matcher::new_regex(
                 Regex::from_java_regex(&postag, true, true).unwrap(),
@@ -980,23 +925,23 @@ fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildI
                 true,
             ),
             info,
-        )),
+        )?),
         Some(_) | None => PosFilter::new(PosMatcher::new(
             Matcher::new_string(either::Left(postag.into()), false, false, true),
             info,
-        )),
-    }
+        )?),
+    })
 }
 
 impl DisambiguationRule {
-    pub(crate) fn from_rule_structure(
-        data: structure::DisambiguationRule,
+    pub fn from_rule_structure(
+        data: super::DisambiguationRule,
         info: &mut BuildInfo,
     ) -> Result<DisambiguationRule, Error> {
         // might need the pattern later so clone it here
         let (composition, start, end) = parse_pattern(data.pattern.clone(), info)?;
 
-        let unify_filters = parse_features(&data.pattern, &data.unifications, info);
+        let unify_filters = parse_features(&data.pattern, &data.unifications, info)?;
         let unify_mask: Vec<_> = composition.parts.iter().map(|part| part.unify).collect();
 
         let antipatterns = if let Some(antipatterns) = data.antipatterns {
@@ -1025,25 +970,25 @@ impl DisambiguationRule {
         let word_datas: Vec<_> = if let Some(wds) = data.disambig.word_datas {
             wds.into_iter()
                 .map(|part| match part {
-                    structure::DisambiguationPart::WordData(x) => {
-                        either::Left(WordData::from_structure(x, info))
+                    super::DisambiguationPart::WordData(x) => {
+                        WordData::from_structure(x, info).map(either::Left)
+                    }
+                    super::DisambiguationPart::Match(x) => {
+                        parse_pos_filter(&x.postag.unwrap(), x.postag_regexp.as_deref(), info)
+                            .map(either::Right)
                     }
-                    structure::DisambiguationPart::Match(x) => either::Right(parse_pos_filter(
-                        &x.postag.unwrap(),
-                        x.postag_regexp.as_deref(),
-                        info,
-                    )),
                 })
-                .collect()
+                .collect::<Result<_, Error>>()?
         } else {
             Vec::new()
         };
 
+        let tagger = info.tagger();
         let disambiguations = match data.disambig.action.as_deref() {
             Some("remove") => {
                 if let Some(postag) = data.disambig.postag.as_ref() {
                     Ok(Disambiguation::Remove(vec![either::Right(
-                        parse_pos_filter(postag, Some("yes"), info),
+                        parse_pos_filter(postag, Some("yes"), info)?,
                     )]))
                 } else {
                     Ok(Disambiguation::Remove(word_datas.into_iter().collect()))
@@ -1081,45 +1026,59 @@ impl DisambiguationRule {
 
                 for part in &data.pattern.parts {
                     match part {
-                        structure::PatternPart::Marker(marker) => {
+                        super::PatternPart::Marker(marker) => {
                             has_marker = true;
                             for token in &marker.tokens {
                                 let token = match token {
-                                    structure::TokenCombination::Token(token) => token,
-                                    structure::TokenCombination::And(tokens)
-                                    | structure::TokenCombination::Or(tokens) => &tokens.tokens[0],
-                                    structure::TokenCombination::Feature(_) => continue,
+                                    super::TokenCombination::Token(token) => token,
+                                    super::TokenCombination::And(tokens)
+                                    | super::TokenCombination::Or(tokens) => &tokens.tokens[0],
+                                    super::TokenCombination::Feature(_) => continue,
                                 };
 
-                                marker_disambig.push(token.postag.as_ref().map(|x| {
-                                    either::Right(parse_pos_filter(
-                                        x,
-                                        token.postag_regexp.as_deref(),
-                                        info,
-                                    ))
-                                }));
+                                marker_disambig.push(
+                                    token
+                                        .postag
+                                        .as_ref()
+                                        .map(|x| {
+                                            parse_pos_filter(
+                                                x,
+                                                token.postag_regexp.as_deref(),
+                                                info,
+                                            )
+                                            .map(either::Right)
+                                        })
+                                        .transpose()?,
+                                );
                             }
                         }
-                        structure::PatternPart::Token(token) => {
-                            disambig.push(token.postag.as_ref().map(|x| {
-                                either::Right(parse_pos_filter(
-                                    x,
-                                    token.postag_regexp.as_deref(),
-                                    info,
-                                ))
-                            }))
-                        }
-                        structure::PatternPart::And(tokens)
-                        | structure::PatternPart::Or(tokens) => {
-                            disambig.push(tokens.tokens[0].postag.as_ref().map(|x| {
-                                either::Right(parse_pos_filter(
-                                    x,
-                                    tokens.tokens[0].postag_regexp.as_deref(),
-                                    info,
-                                ))
-                            }))
+                        super::PatternPart::Token(token) => disambig.push(
+                            token
+                                .postag
+                                .as_ref()
+                                .map(|x| {
+                                    parse_pos_filter(x, token.postag_regexp.as_deref(), info)
+                                        .map(either::Right)
+                                })
+                                .transpose()?,
+                        ),
+                        super::PatternPart::And(tokens) | super::PatternPart::Or(tokens) => {
+                            disambig.push(
+                                tokens.tokens[0]
+                                    .postag
+                                    .as_ref()
+                                    .map(|x| {
+                                        parse_pos_filter(
+                                            x,
+                                            tokens.tokens[0].postag_regexp.as_deref(),
+                                            info,
+                                        )
+                                        .map(either::Right)
+                                    })
+                                    .transpose()?,
+                            )
                         }
-                        structure::PatternPart::Feature(_) => {}
+                        super::PatternPart::Feature(_) => {}
                     }
                 }
 
@@ -1131,7 +1090,7 @@ impl DisambiguationRule {
 
                 Ok(Disambiguation::Filter(
                     disambiguations.into_iter().collect(),
-                    info.tagger().lang_options().retain_last,
+                    tagger.lang_options().retain_last,
                 ))
             }
             Some("filter") => {
@@ -1141,13 +1100,13 @@ impl DisambiguationRule {
                             postag,
                             Some("yes"),
                             info,
-                        )))],
-                        info.tagger().lang_options().retain_last,
+                        )?))],
+                        tagger.lang_options().retain_last,
                     ))
                 } else {
                     Ok(Disambiguation::Filter(
                         word_datas.into_iter().map(Some).collect(),
-                        info.tagger().lang_options().retain_last,
+                        tagger.lang_options().retain_last,
                     ))
                 }
             }
@@ -1161,36 +1120,61 @@ impl DisambiguationRule {
 
                 for part in &data.pattern.parts {
                     match part {
-                        structure::PatternPart::Marker(marker) => {
+                        super::PatternPart::Marker(marker) => {
                             has_marker = true;
                             for token in &marker.tokens {
                                 let token = match token {
-                                    structure::TokenCombination::Token(token) => token,
-                                    structure::TokenCombination::And(tokens)
-                                    | structure::TokenCombination::Or(tokens) => &tokens.tokens[0],
-                                    structure::TokenCombination::Feature(_) => continue,
+                                    super::TokenCombination::Token(token) => token,
+                                    super::TokenCombination::And(tokens)
+                                    | super::TokenCombination::Or(tokens) => &tokens.tokens[0],
+                                    super::TokenCombination::Feature(_) => continue,
                                 };
 
-                                marker_disambig.push(token.postag.as_ref().map(|x| {
-                                    parse_pos_filter(x, token.postag_regexp.as_deref(), info)
-                                }));
+                                marker_disambig.push(
+                                    token
+                                        .postag
+                                        .as_ref()
+                                        .map(|x| {
+                                            parse_pos_filter(
+                                                x,
+                                                token.postag_regexp.as_deref(),
+                                                info,
+                                            )
+                                        })
+                                        .transpose()?,
+                                );
                                 marker_mask.push(token.unify.is_some())
                             }
                         }
-                        structure::PatternPart::Token(token) => {
-                            disambig.push(token.postag.as_ref().map(|x| {
-                                parse_pos_filter(x, token.postag_regexp.as_deref(), info)
-                            }));
+                        super::PatternPart::Token(token) => {
+                            disambig.push(
+                                token
+                                    .postag
+                                    .as_ref()
+                                    .map(|x| {
+                                        parse_pos_filter(x, token.postag_regexp.as_deref(), info)
+                                    })
+                                    .transpose()?,
+                            );
                             mask.push(token.unify.is_some());
                         }
-                        structure::PatternPart::And(tokens)
-                        | structure::PatternPart::Or(tokens) => {
-                            disambig.push(tokens.tokens[0].postag.as_ref().map(|x| {
-                                parse_pos_filter(x, tokens.tokens[0].postag_regexp.as_deref(), info)
-                            }));
+                        super::PatternPart::And(tokens) | super::PatternPart::Or(tokens) => {
+                            disambig.push(
+                                tokens.tokens[0]
+                                    .postag
+                                    .as_ref()
+                                    .map(|x| {
+                                        parse_pos_filter(
+                                            x,
+                                            tokens.tokens[0].postag_regexp.as_deref(),
+                                            info,
+                                        )
+                                    })
+                                    .transpose()?,
+                            );
                             mask.push(tokens.tokens[0].unify.is_some());
                         }
-                        structure::PatternPart::Feature(_) => {}
+                        super::PatternPart::Feature(_) => {}
                     }
                 }
 
@@ -1206,15 +1190,15 @@ impl DisambiguationRule {
                 if let Some(postag) = data.disambig.postag.as_ref() {
                     Ok(Disambiguation::Filter(
                         vec![Some(either::Left(WordData::new(
-                            info.tagger.id_word("".into()),
-                            info.tagger.id_tag(postag).into_static(),
+                            tagger.id_word("".into()),
+                            tagger.id_tag(postag).into_static(),
                         )))],
-                        info.tagger().lang_options().retain_last,
+                        tagger.lang_options().retain_last,
                     ))
                 } else {
                     Ok(Disambiguation::Filter(
                         word_datas.into_iter().map(Some).collect(),
-                        info.tagger().lang_options().retain_last,
+                        tagger.lang_options().retain_last,
                     ))
                 }
             }
@@ -1253,11 +1237,11 @@ impl DisambiguationRule {
 
                 for part in &example.parts {
                     match part {
-                        structure::ExamplePart::Text(text) => {
+                        super::ExamplePart::Text(text) => {
                             texts.push(text.as_str());
                             char_length += text.chars().count();
                         }
-                        structure::ExamplePart::Marker(marker) => {
+                        super::ExamplePart::Marker(marker) => {
                             if char_span.is_some() {
                                 return Err(Error::Unexpected(
                                     "example must have one or zero markers".into(),
@@ -1330,3 +1314,160 @@ impl DisambiguationRule {
         })
     }
 }
+
+// Flattens a rule group into a `Vec` of `(rule, Some(group), category)` tuples, one
+// per rule in the group, preserving the order of the rules.
+//
+// Antipatterns declared on the group are appended to the antipatterns of every rule
+// (a rule without antipatterns receives a copy of the group's list). The zero-based
+// position of the rule inside its group is stored in `Group::n`.
+macro_rules! flatten_group {
+    ($rulegroup:expr, $category:expr) => {{
+        let shared_antipatterns = $rulegroup.antipatterns.unwrap_or_default();
+        let template = super::Group {
+            id: $rulegroup.id,
+            default: $rulegroup.default,
+            name: $rulegroup.name,
+            n: 0,
+        };
+
+        let mut flattened = Vec::new();
+        for (position, mut rule) in $rulegroup.rules.into_iter().enumerate() {
+            // every rule gets its own copy of the group-level antipatterns
+            match &mut rule.antipatterns {
+                Some(own) => own.extend(shared_antipatterns.clone()),
+                None => rule.antipatterns = Some(shared_antipatterns.clone()),
+            }
+
+            // record the index of the rule within its group
+            let mut group = template.clone();
+            group.n = position;
+            flattened.push((rule, Some(group), $category.clone()));
+        }
+
+        flattened
+    }};
+}
+
+type GrammarRuleReading = (super::Rule, Option<super::Group>, Option<super::Category>);
+type DisambiguationRuleReading = (
+    super::DisambiguationRule,
+    Option<super::Group>,
+    Option<super::Category>,
+);
+
+/// Reads the grammar rules contained in the XML file at `path`.
+///
+/// Every entry of the result is either a parsed rule together with its group (`None`
+/// for rules outside of a rule group) and category, or the deserialization error of
+/// an entry which could not be parsed. The unifications found in the file are
+/// attached to each successfully parsed rule.
+///
+/// # Panics
+/// Panics if the file at `path` can not be opened.
+pub fn read_rules<P: AsRef<std::path::Path>>(
+    path: P,
+) -> Vec<Result<GrammarRuleReading, serde_xml_rs::Error>> {
+    let reader = BufReader::new(File::open(path.as_ref()).unwrap());
+
+    let sanitized = super::preprocess::sanitize(reader, &["suggestion"]);
+    let raw_rules = super::preprocess::extract_rules(sanitized.as_bytes());
+
+    // filled while parsing, so it is only complete after the first pass below
+    let mut unifications = Vec::new();
+
+    let parsed: Vec<_> = raw_rules
+        .into_iter()
+        .flat_map(|(xml, category)| {
+            let container = super::RuleContainer::deserialize(
+                &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())),
+            );
+
+            // a container holds a single rule, a rule group or a unification
+            match container {
+                Ok(super::RuleContainer::Rule(rule)) => vec![Ok((rule, None, category))],
+                Ok(super::RuleContainer::RuleGroup(rule_group)) => {
+                    flatten_group!(rule_group, category)
+                        .into_iter()
+                        .map(Ok)
+                        .collect()
+                }
+                Ok(super::RuleContainer::Unification(unification)) => {
+                    unifications.push(unification);
+                    Vec::new()
+                }
+                Err(err) => vec![Err(err)],
+            }
+        })
+        .collect();
+
+    // second pass: every parsed rule receives a copy of all unifications
+    parsed
+        .into_iter()
+        .map(|reading| {
+            reading.map(|mut parsed_rule| {
+                parsed_rule.0.unifications = Some(unifications.clone());
+                parsed_rule
+            })
+        })
+        .collect()
+}
+
+/// Reads the disambiguation rules contained in the XML file at `path`.
+///
+/// Every entry of the result is either a parsed rule together with its group (`None`
+/// for rules outside of a rule group) and a category which is always `None`, or the
+/// deserialization error of an entry which could not be parsed. The unifications
+/// found in the file are attached to each successfully parsed rule.
+///
+/// Panics if the file at `path` can not be opened.
+pub fn read_disambiguation_rules<P: AsRef<std::path::Path>>(
+    path: P,
+) -> Vec<Result<DisambiguationRuleReading, serde_xml_rs::Error>> {
+    let reader = BufReader::new(File::open(path.as_ref()).unwrap());
+
+    let sanitized = super::preprocess::sanitize(reader, &[]);
+    let raw_rules = super::preprocess::extract_rules(sanitized.as_bytes());
+
+    // filled while parsing, so it is only complete after the first pass below
+    let mut unifications = Vec::new();
+
+    let parsed: Vec<_> = raw_rules
+        .into_iter()
+        .flat_map(|(xml, _)| {
+            let container = super::DisambiguationRuleContainer::deserialize(
+                &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())),
+            );
+
+            let category: Option<Category> = None;
+
+            match container {
+                Ok(super::DisambiguationRuleContainer::Rule(rule)) => {
+                    vec![Ok((rule, None, category))]
+                }
+                Ok(super::DisambiguationRuleContainer::RuleGroup(rule_group)) => {
+                    flatten_group!(rule_group, category)
+                        .into_iter()
+                        .map(Ok)
+                        .collect()
+                }
+                Ok(super::DisambiguationRuleContainer::Unification(unification)) => {
+                    unifications.push(unification);
+                    Vec::new()
+                }
+                Err(err) => vec![Err(err)],
+            }
+        })
+        .collect();
+
+    // second pass: every parsed rule receives a copy of all unifications
+    parsed
+        .into_iter()
+        .map(|reading| {
+            reading.map(|mut parsed_rule| {
+                parsed_rule.0.unifications = Some(unifications.clone());
+                parsed_rule
+            })
+        })
+        .collect()
+}
diff --git a/nlprule/src/components/rules/mod.rs b/nlprule/src/components/rules/mod.rs
new file mode 100644
index 0000000..78be9ea
--- /dev/null
+++ b/nlprule/src/components/rules/mod.rs
@@ -0,0 +1,338 @@
+use log::info;
+use serde::{Deserialize, Serialize};
+use std::iter::FromIterator;
+
+use crate::properties::*;
+use crate::rule::Rule;
+use crate::types::*;
+use crate::utils::parallelism::MaybeParallelRefIterator;
+use crate::{
+    properties::Transform,
+    rule::{
+        id::{Index, Selector},
+        DisambiguationRule, MatchSentence,
+    },
+    types::Sentence,
+};
+use once_cell::sync::OnceCell;
+
+use super::Component;
+
+#[cfg(feature = "compile")]
+mod compile;
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct Disambiguator {
+    rules: Vec<DisambiguationRule>,
+    #[serde(skip)]
+    properties: OnceCell<PropertiesMut>,
+}
+
+impl Transform for Disambiguator {
+    fn properties(&self) -> PropertiesMut {
+        *self.properties.get_or_init(|| {
+            self.rules
+                .iter()
+                .map(|rule| rule.compute_properties())
+                .collect()
+        })
+    }
+
+    fn transform<'t>(
+        &'t self,
+        sentence: Sentence<'t>,
+    ) -> Result<Sentence<'t>, crate::properties::Error> {
+        self.disambiguate_up_to_id(sentence, None)
+    }
+
+    fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
+        let mut current_rules: Vec<&DisambiguationRule> = Vec::new();
+        let mut passes = 0;
+
+        for rule in self.rules() {
+            let pipeline = tokenize::Pipeline::new((
+                &tokenizer,
+                current_rules
+                    .iter()
+                    .map(|x| (*x).clone())
+                    .collect::<Disambiguator>(),
+            ))?;
+
+            if rule.test(&pipeline).is_ok() {
+                passes += 1;
+            }
+
+            current_rules.push(rule);
+        }
+
+        info!(
+            "{0} out of {1} Disambiguation Rule tests passed.",
+            passes,
+            self.rules.len()
+        );
+
+        if passes == self.rules().len() {
+            Ok(())
+        } else {
+            Err(crate::Error::TestFailed)
+        }
+    }
+}
+
+impl Component for Disambiguator {
+    fn name() -> &'static str {
+        "disambiguator"
+    }
+}
+
+impl Disambiguator {
+    /// Gets all disambiguation rules in the order they are applied.
+    pub fn rules(&self) -> &[DisambiguationRule] {
+        &self.rules
+    }
+
+    pub(crate) fn disambiguate_up_to_id<'t>(
+        &'t self,
+        mut sentence: Sentence<'t>,
+        id: Option<&Index>,
+    ) -> Result<Sentence<'t>, crate::properties::Error> {
+        let n = id.map_or(self.rules.len(), |id| {
+            self.rules.iter().position(|x| x.id == *id).unwrap()
+        });
+        let mut i = 0;
+
+        let guard = self.property_guard(&mut sentence)?;
+
+        while i < n {
+            let match_sentence = MatchSentence::new(&sentence, guard.downgrade());
+
+            let result = self.rules[i..n]
+                .maybe_par_iter()
+                .enumerate()
+                .filter_map(|(j, rule)| {
+                    let changes = rule.apply(&match_sentence);
+
+                    match changes {
+                        Ok(changes) => {
+                            if changes.is_empty() {
+                                None
+                            } else {
+                                Some(Ok((j + i, changes)))
+                            }
+                        }
+                        Err(err) => Some(Err(err)),
+                    }
+                })
+                .find_first(|_| true)
+                .transpose()?;
+
+            if let Some((index, changes)) = result {
+                self.rules[index].change(&mut sentence, changes, guard)?;
+                i = index + 1;
+            } else {
+                i = n;
+            }
+        }
+
+        Ok(sentence)
+    }
+}
+
+/// A set of grammatical error correction rules.
+#[derive(Serialize, Deserialize, Default, Clone)]
+pub struct Rules {
+    rules: Vec<Rule>,
+    #[serde(skip)]
+    properties: OnceCell<Properties>,
+}
+
+impl Component for Rules {
+    fn name() -> &'static str {
+        "rules"
+    }
+}
+
+impl Suggest for Rules {
+    fn properties(&self) -> Properties {
+        *self.properties.get_or_init(|| {
+            self.rules
+                .iter()
+                .map(|rule| rule.compute_properties())
+                .collect()
+        })
+    }
+
+    fn suggest(&self, sentence: &Sentence) -> Result<Vec<Suggestion>, crate::properties::Error> {
+        let sentence = MatchSentence::new(sentence, self.property_guard(sentence)?);
+
+        let mut output: Vec<(usize, Suggestion)> = self
+            .rules
+            .maybe_par_iter()
+            .enumerate()
+            .filter(|(_, rule)| rule.enabled())
+            .map(|(i, rule)| {
+                let mut output = Vec::new();
+
+                for suggestion in rule.apply(&sentence) {
+                    match suggestion {
+                        Ok(suggestion) => output.push((i, suggestion)),
+                        Err(err) => return Err(err),
+                    }
+                }
+
+                Ok(output)
+            })
+            .collect::<Result<Vec<Vec<_>>, crate::properties::Error>>()?
+            .into_iter()
+            .flatten()
+            .collect();
+
+        output.sort_by(|(ia, a), (ib, b)| {
+            a.span()
+                .char()
+                .start
+                .cmp(&b.span().char().start)
+                .then_with(|| ib.cmp(ia))
+        });
+
+        let mut mask = vec![false; sentence.text().chars().count()];
+
+        Ok(output
+            .into_iter()
+            .filter_map(|(_, suggestion)| {
+                let span = suggestion.span().clone().lshift(sentence.span().start());
+
+                if mask[span.char().clone()].iter().all(|x| !x) {
+                    mask[span.char().clone()].iter_mut().for_each(|x| *x = true);
+                    Some(suggestion)
+                } else {
+                    None
+                }
+            })
+            .collect())
+    }
+
+    fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
+        let mut passes = 0;
+
+        for rule in self.rules() {
+            if rule.test(&tokenizer).is_ok() {
+                passes += 1;
+            };
+        }
+
+        info!(
+            "{0} out of {1} Grammar Rule tests passed.",
+            passes,
+            self.rules.len()
+        );
+
+        if passes == self.rules().len() {
+            Ok(())
+        } else {
+            Err(crate::Error::TestFailed)
+        }
+    }
+}
+
+impl Rules {
+    /// All rules ordered by priority.
+    pub fn rules(&self) -> &[Rule] {
+        &self.rules
+    }
+
+    /// All rules ordered by priority (mutable).
+    pub fn rules_mut(&mut self) -> &mut [Rule] {
+        &mut self.rules
+    }
+
+    /// Returns an iterator over all rules matching the selector.
+    pub fn select<'a>(&'a self, selector: &'a Selector) -> RulesIter<'a> {
+        RulesIter {
+            inner: self.rules.iter(),
+            selector: Some(selector),
+        }
+    }
+
+    /// Returns an iterator over all rules matching the selector (mutable).
+    pub fn select_mut<'a>(&'a mut self, selector: &'a Selector) -> RulesIterMut<'a> {
+        RulesIterMut {
+            inner: self.rules.iter_mut(),
+            selector: Some(selector),
+        }
+    }
+}
+
+/// An iterator over references to rules.
+pub struct RulesIter<'a> {
+    selector: Option<&'a Selector>,
+    inner: std::slice::Iter<'a, Rule>,
+}
+
+impl<'a> Iterator for RulesIter<'a> {
+    type Item = &'a Rule;
+    fn next(&mut self) -> Option<Self::Item> {
+        let selector = self.selector.as_ref();
+
+        self.inner
+            .find(|rule| selector.map_or(true, |s| s.is_match(rule.id())))
+    }
+}
+
+/// An iterator over mutable references to rules.
+pub struct RulesIterMut<'a> {
+    selector: Option<&'a Selector>,
+    inner: std::slice::IterMut<'a, Rule>,
+}
+
+impl<'a> Iterator for RulesIterMut<'a> {
+    type Item = &'a mut Rule;
+    fn next(&mut self) -> Option<Self::Item> {
+        let selector = self.selector.as_ref();
+
+        self.inner
+            .find(|rule| selector.map_or(true, |s| s.is_match(rule.id())))
+    }
+}
+
+impl IntoIterator for Rules {
+    type Item = Rule;
+    type IntoIter = std::vec::IntoIter<Rule>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.rules.into_iter()
+    }
+}
+
+impl<R> FromIterator<R> for Rules
+where
+    R: Into<Rule>,
+{
+    fn from_iter<I: IntoIterator<Item = R>>(iter: I) -> Self {
+        let rules: Vec<Rule> = iter.into_iter().map(|x| x.into()).collect();
+        Self {
+            rules,
+            properties: OnceCell::default(),
+        }
+    }
+}
+
+impl IntoIterator for Disambiguator {
+    type Item = DisambiguationRule;
+    type IntoIter = std::vec::IntoIter<DisambiguationRule>;
+    fn into_iter(self) -> Self::IntoIter {
+        self.rules.into_iter()
+    }
+}
+
+impl<R> FromIterator<R> for Disambiguator
+where
+    R: Into<DisambiguationRule>,
+{
+    fn from_iter<I: IntoIterator<Item = R>>(iter: I) -> Self {
+        let rules: Vec<DisambiguationRule> = iter.into_iter().map(|x| x.into()).collect();
+        Self {
+            rules,
+            properties: OnceCell::default(),
+        }
+    }
+}
diff --git a/nlprule/src/components/tagger/compile.rs b/nlprule/src/components/tagger/compile.rs
new file mode 100644
index 0000000..0e4dd94
--- /dev/null
+++ b/nlprule/src/components/tagger/compile.rs
@@ -0,0 +1,165 @@
+use fs_err as fs;
+use fs_err::File;
+
+use crate::compile::{BuildComponent, BuildInfo, Error};
+use crate::components::tagger::TaggerLangOptions;
+
+use super::*;
+use serde::Deserialize;
+use std::{
+    collections::HashSet,
+    io::{BufRead, BufReader},
+    path::{Path, PathBuf},
+};
+
+/// Reads `(word, lemma, tag)` triples from tab-separated dictionary files.
+///
+/// Lines starting with `#` are comments. A line from `paths` is dropped if the identical line
+/// occurs in one of the `remove_paths` files. A line with less than three tab-separated fields
+/// yields an `InvalidData` error (instead of a panic), fields after the third are ignored.
+fn get_lines<S1: AsRef<Path>, S2: AsRef<Path>>(
+    paths: &[S1],
+    remove_paths: &[S2],
+) -> std::io::Result<Vec<(String, String, String)>> {
+    let mut output = Vec::new();
+    // a set, since it is probed once for every line of the dictionary
+    let mut disallowed: HashSet<String> = HashSet::new();
+
+    for path in remove_paths {
+        for line in BufReader::new(File::open(path.as_ref())?).lines() {
+            let line = line?;
+            if !line.starts_with('#') {
+                disallowed.insert(line);
+            }
+        }
+    }
+
+    for path in paths {
+        for line in BufReader::new(File::open(path.as_ref())?).lines() {
+            let line = line?;
+            if line.starts_with('#') || disallowed.contains(&line) {
+                continue;
+            }
+
+            // take the fields one by one, indexing would panic on a short line
+            let mut parts = line.split('\t');
+            match (parts.next(), parts.next(), parts.next()) {
+                (Some(word), Some(inflection), Some(tag)) => {
+                    output.push((word.to_string(), inflection.to_string(), tag.to_string()))
+                }
+                _ => {
+                    return Err(std::io::Error::new(
+                        std::io::ErrorKind::InvalidData,
+                        format!("expected `word<TAB>lemma<TAB>tag`, got {:?}", line),
+                    ))
+                }
+            }
+        }
+    }
+
+    Ok(output)
+}
+
+#[derive(Deserialize)]
+pub struct Paths {
+    tag_dict: Vec<PathBuf>, // files with tab-separated word, lemma and tag lines
+    tag_remove_dict: Vec<PathBuf>, // lines in these files are removed from the `tag_dict` lines
+    common_words: PathBuf, // one word per line, every word is added to the word store
+    tagger_options: PathBuf, // JSON file, deserialized into `TaggerLangOptions`
+}
+
+impl BuildComponent for Tagger {
+    type Paths = Paths;
+
+    /// Builds a tagger from the raw dictionary files listed in `paths`.
+    /// Fails if a file can not be opened or read, or if the options can not be parsed.
+    ///
+    /// # Arguments
+    /// * `paths`: each line of a `tag_dict` file holds a word, lemma and tag separated by tabs;
+    /// lines that also appear in a `tag_remove_dict` file are left out. `common_words` lists one
+    /// word per line and `tagger_options` is a JSON file with the language options.
+    /// * `_build_info`: not needed to build the tagger.
+    ///
+    /// # Panics
+    /// If the empty string occurs as a word or lemma in any of the wordlists.
+    fn build(paths: Paths, _build_info: Option<&mut BuildInfo>) -> Result<Self, Error> {
+        let options_reader = BufReader::new(File::open(&paths.tagger_options)?);
+        let options: TaggerLangOptions = serde_json::from_reader(options_reader)?;
+
+        let common_words: HashSet<String> = fs::read_to_string(paths.common_words)?
+            .lines()
+            .map(ToOwned::to_owned)
+            .collect();
+
+        let lines = get_lines(&paths.tag_dict, &paths.tag_remove_dict)?;
+
+        // the tags start out with the language specific special tags
+        let mut tag_store: HashSet<&str> = HashSet::new();
+        tag_store.extend(options.extra_tags.iter().map(|x| x.as_str()));
+
+        // the words start out with every one-byte slice of `punct` and the common words
+        let punct = "!\"#$%&\\'()*+,-./:;<=>?@[\\]^_`{|}~";
+        let mut word_store: HashSet<&str> = HashSet::new();
+        word_store.extend((0..punct.len()).map(|i| &punct[i..(i + 1)]));
+        word_store.extend(common_words.iter().map(|x| x.as_str()));
+
+        // then everything that occurs in the dictionary is added
+        for (word, inflection, tag) in lines.iter() {
+            word_store.insert(word);
+            word_store.insert(inflection);
+            tag_store.insert(tag);
+        }
+
+        // the empty string must not be part of any wordlist
+        assert!(!word_store.contains(""));
+
+        // sorting makes the word ids consistent across runs
+        let mut word_store: Vec<_> = word_store.into_iter().collect();
+        word_store.sort_unstable();
+
+        // the special empty string must be the first element so that it gets id 0
+        word_store.insert(0, "");
+
+        // sorting makes the tag ids consistent across runs
+        let mut tag_store: Vec<_> = tag_store.into_iter().collect();
+        tag_store.sort_unstable();
+
+        // the special part of speech tags must have ids starting from zero
+        for (i, special_pos) in SpecialPos::iter().enumerate() {
+            tag_store.insert(i, special_pos);
+        }
+
+        // the position in the list becomes the id
+        let word_store: BiMap<_, _> = word_store
+            .iter()
+            .enumerate()
+            .map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32)))
+            .collect();
+        let tag_store: BiMap<_, _> = tag_store
+            .iter()
+            .enumerate()
+            .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16)))
+            .collect();
+
+        // one slot per word id, it stays `None` for a word without an entry of its own
+        let mut tags: Vec<Option<Vec<(WordIdInt, PosIdInt)>>> = vec![None; word_store.len()];
+
+        for (word, inflection, tag) in lines.iter() {
+            // the lookups can not fail, the stores were filled from these very lines
+            let word_id = word_store.get_by_left(word).unwrap();
+            let lemma_id = word_store.get_by_left(inflection).unwrap();
+            let pos_id = tag_store.get_by_left(tag).unwrap();
+
+            tags[word_id.value() as usize]
+                .get_or_insert_with(Vec::new)
+                .push((*lemma_id, *pos_id));
+        }
+
+        Ok(Tagger {
+            tags: WordIdMap(tags),
+            word_store,
+            tag_store,
+            lang_options: options,
+        })
+    }
+}
diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/components/tagger/mod.rs
similarity index 92%
rename from nlprule/src/tokenizer/tag.rs
rename to nlprule/src/components/tagger/mod.rs
index c1ad8fc..4a88c0c 100644
--- a/nlprule/src/tokenizer/tag.rs
+++ b/nlprule/src/components/tagger/mod.rs
@@ -1,6 +1,6 @@
 //! A dictionary-based tagger.
 
-use crate::{types::*, utils::parallelism::MaybeParallelRefIterator};
+use crate::{properties::*, types::*, utils::parallelism::MaybeParallelRefIterator};
 use bimap::BiMap;
 use fst::{IntoStreamer, Map, Streamer};
 use log::error;
@@ -12,6 +12,9 @@ use std::{
     iter::{once, FusedIterator},
 };
 
+#[cfg(feature = "compile")]
+mod compile;
+
 #[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(transparent)]
 pub(crate) struct WordIdInt(u32);
@@ -58,6 +61,12 @@ impl<'t> fmt::Debug for WordId<'t> {
     }
 }
 
+impl<'t> Default for WordId<'t> {
+    fn default() -> Self {
+        WordId::empty() // the default is the same value `WordId::empty()` returns
+    }
+}
+
 impl<'t> WordId<'t> {
     pub(crate) fn id(&self) -> &Option<WordIdInt> {
         &self.1
@@ -72,6 +81,13 @@ impl<'t> WordId<'t> {
         self.0.as_ref()
     }
 
+    pub fn as_ref_str(&self) -> &'t str {
+        match &self.0 { // panics if the text is owned
+            Cow::Borrowed(x) => *x, // the borrowed text is returned with its full `'t` lifetime
+            Cow::Owned(_) => panic!("can not get `&'t str` reference from owned Cow!"),
+        }
+    }
+
     /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data.
     pub fn into_static(self) -> WordId<'static> {
         WordId(self.0.into_owned().into(), self.1)
@@ -397,11 +413,9 @@ impl<T: Clone + Default> WordIdMap<T> {
             .iter()
             .enumerate()
             .filter_map(|(index, maybe_value)| {
-                if let Some(value) = maybe_value {
-                    Some((WordIdInt(index as u32), value))
-                } else {
-                    None
-                }
+                maybe_value
+                    .as_ref()
+                    .map(|value| (WordIdInt(index as u32), value))
             })
     }
 }
@@ -534,10 +548,10 @@ impl<'a> ExactSizeIterator for TagIter<'a> {
 #[derive(Default, Serialize, Deserialize, Clone)]
 #[serde(from = "TaggerFields", into = "TaggerFields")]
 pub struct Tagger {
-    pub(crate) tags: WordIdMap<Vec<(WordIdInt, PosIdInt)>>,
-    pub(crate) tag_store: BiMap<String, PosIdInt>,
-    pub(crate) word_store: BiMap<String, WordIdInt>,
-    pub(crate) lang_options: TaggerLangOptions,
+    tags: WordIdMap<Vec<(WordIdInt, PosIdInt)>>,
+    tag_store: BiMap<String, PosIdInt>,
+    word_store: BiMap<String, WordIdInt>,
+    lang_options: TaggerLangOptions,
 }
 
 impl Tagger {
@@ -725,4 +739,42 @@ impl Tagger {
     pub fn get_tags<'a>(&'a self, word: &'a str) -> TagIter<'a> {
         self.get_tags_with_options(word, None, None)
     }
+
+    /// Looks up the tags of every token in `sentence` and stores them on the token.
+    ///
+    /// Besides the tags found by the lookup, every token gets one entry for its own text with
+    /// `SpecialPos::None`, and a token that ends the sentence a final `SpecialPos::SentEnd` entry.
+    /// Fails if `guard.tags_mut` returns an error.
+    pub fn transform<'t>(
+        &'t self,
+        mut sentence: Sentence<'t>,
+        guard: PropertyGuardMut,
+    ) -> Result<Sentence<'t>, crate::properties::Error> {
+        for token in sentence.iter_mut() {
+            // the lookup is only told about the start of a sentence, `None` is passed otherwise
+            let is_sentence_start = if token.is_sentence_start() {
+                Some(true)
+            } else {
+                None
+            };
+
+            let mut tag_vec: Vec<_> = self
+                .get_tags_with_options(token.as_str(), is_sentence_start, None)
+                .collect();
+
+            let own_id = self.id_word(token.as_str().into());
+            tag_vec.push(WordData::new(own_id, PosId::special(SpecialPos::None)).freeze());
+
+            if token.is_sentence_end() {
+                let sent_end = WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd));
+                tag_vec.push(sent_end.freeze());
+            }
+
+            // the new tags replace the tags the token had so far
+            let word_id = self.id_word(token.as_str().into());
+            *guard.tags_mut(token)? = Tags::new(word_id, tag_vec);
+        }
+
+        Ok(sentence)
+    }
 }
diff --git a/nlprule/src/components/tokenizer/compile.rs b/nlprule/src/components/tokenizer/compile.rs
new file mode 100644
index 0000000..63bb36c
--- /dev/null
+++ b/nlprule/src/components/tokenizer/compile.rs
@@ -0,0 +1,45 @@
+use fs_err as fs;
+use fs_err::File;
+use std::{io::BufReader, path::PathBuf, str::FromStr};
+
+use crate::compile::{BuildComponent, BuildInfo, Error};
+
+use super::*;
+
+#[derive(Deserialize)]
+pub struct Paths {
+    tokenizer_options: PathBuf, // JSON file, deserialized into `TokenizerLangOptions`
+    srx: PathBuf, // SRX file, parsed into the sentence splitting rules
+    lang_code: PathBuf, // file holding the language code used to select the SRX rules
+}
+
+impl BuildComponent for Tokenizer {
+    type Paths = Paths;
+
+    /// Builds a tokenizer from raw files; its tagger is cloned from `build_info`, which must be set.
+    fn build(paths: Paths, build_info: Option<&mut BuildInfo>) -> Result<Self, Error> {
+        let build_info = build_info.ok_or(Error::BuildInfoUnset)?;
+
+        let options_reader = BufReader::new(File::open(&paths.tokenizer_options)?);
+        let lang_options: TokenizerLangOptions = serde_json::from_reader(options_reader)?;
+        let lang_code = fs::read_to_string(paths.lang_code)?;
+        let srx_rules = srx::SRX::from_str(&fs::read_to_string(&paths.srx)?)?;
+        let sentencizer = srx_rules.language_rules(lang_code);
+
+        // words of the tagger containing an extra split character go on the whitelist
+        let mut whitelist = DefaultHashSet::new();
+        for (word, _) in build_info.tagger().word_store() {
+            if word.contains(|c| lang_options.extra_split_chars.contains(&c)) {
+                whitelist.insert(word.to_owned());
+            }
+        }
+
+        Ok(Tokenizer {
+            tagger: build_info.tagger().clone(),
+            sentencizer,
+            lang_options,
+            whitelist,
+            properties: Default::default(),
+        })
+    }
+}
diff --git a/nlprule/src/components/tokenizer/mod.rs b/nlprule/src/components/tokenizer/mod.rs
new file mode 100644
index 0000000..3ce20ff
--- /dev/null
+++ b/nlprule/src/components/tokenizer/mod.rs
@@ -0,0 +1,238 @@
+//! A tokenizer to split raw text into tokens.
+//! Tokens are assigned lemmas and part-of-speech tags by lookup from a [Tagger][super::tagger::Tagger] and chunks containing
+//! information about noun / verb and grammatical case by a statistical [Chunker][super::chunker::Chunker].
+//! Tokens are *disambiguated* (i. e. information from the initial assignment is changed) in a rule-based way by
+//! [DisambiguationRule][crate::rule::DisambiguationRule]s.
+
+#[cfg(feature = "compile")]
+mod compile;
+
+use std::ops::Range;
+
+use crate::types::*;
+use crate::{properties::*, utils::regex::Regex};
+use lazy_static::lazy_static;
+use once_cell::sync::OnceCell;
+use serde::{Deserialize, Serialize};
+
+use super::{tagger::Tagger, Component};
+
+/// Cuts `text` at every character for which `split_func` returns true. The separators are kept
+/// as pieces of their own, no piece is empty. See https://stackoverflow.com/a/40296745.
+fn split<F>(text: &str, split_func: F) -> Vec<&str>
+where
+    F: Fn(char) -> bool,
+{
+    let mut pieces = Vec::new();
+    let mut start = 0;
+    for (pos, c) in text.char_indices().filter(|&(_, c)| split_func(c)) {
+        if start < pos {
+            pieces.push(&text[start..pos]);
+        }
+        start = pos + c.len_utf8();
+        pieces.push(&text[pos..start]);
+    }
+    if start < text.len() {
+        pieces.push(&text[start..]);
+    }
+
+    pieces
+}
+
+/// Language-specific options for a tokenizer. A field missing in the serialized form is empty.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub(crate) struct TokenizerLangOptions {
+    /// Extra language-specific characters to split text on.
+    #[serde(default)]
+    pub extra_split_chars: Vec<char>,
+    /// Extra language-specific Regexes of which the matches will *not* be split into multiple tokens.
+    #[serde(default)]
+    pub extra_join_regexes: Vec<Regex>,
+}
+
+/// Splits text into sentences and tokens, the tokens are tagged by the contained [Tagger].
+#[derive(Serialize, Deserialize, Default, Clone)]
+pub struct Tokenizer {
+    whitelist: DefaultHashSet<String>, // pretokens in this set are not split at `extra_split_chars`
+    sentencizer: srx::Rules, // splits a text into sentence ranges
+    tagger: Tagger, // assigns the tags to the tokens of a sentence
+    lang_options: TokenizerLangOptions,
+    #[serde(skip)]
+    properties: OnceCell<PropertiesMut>, // not part of the serialized form
+}
+
+impl Tokenize for Tokenizer {
+    /// The tokenizer writes the `Tags` property.
+    fn properties(&self) -> PropertiesMut {
+        lazy_static! {
+            static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]);
+        }
+        *PROPERTIES
+    }
+
+    /// Splits `text` into sentence ranges, a sentence is tokenized once the iterator gets to it.
+    fn tokenize<'t>(&'t self, text: &'t str) -> Box<dyn Iterator<Item = Sentence<'t>> + 't> {
+        Box::new(SentenceIter {
+            text,
+            splits: self.sentencizer.split_ranges(text),
+            tokenizer: self,
+            index: 0,
+            position: Position::default(),
+        })
+    }
+
+    /// Tokenizes and tags one sentence, spans are relative to the start of `sentence`.
+    /// Returns `None` if `sentence` consists of whitespace only.
+    fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option<Sentence<'t>> {
+        if sentence.trim().is_empty() {
+            return None;
+        }
+
+        let token_ranges: Vec<_> = self
+            .get_token_ranges(sentence)
+            .filter(|range| !sentence[range.clone()].trim().is_empty())
+            .collect();
+        let n_tokens = token_ranges.len();
+        let tokens: Vec<_> = token_ranges
+            .into_iter()
+            .enumerate()
+            .map(|(i, range)| {
+                let (byte_start, text_before) = (range.start, &sentence[..range.start]);
+                let char_start = text_before.chars().count();
+                let token_text = sentence[range].trim();
+
+                let byte_span = byte_start..byte_start + token_text.len();
+                let char_span = char_start..char_start + token_text.chars().count();
+                let (is_sentence_start, is_sentence_end) = (i == 0, i == n_tokens - 1);
+
+                Token::new(
+                    token_text,
+                    Span::new(byte_span, char_span),
+                    is_sentence_start,
+                    is_sentence_end,
+                    text_before.ends_with(char::is_whitespace),
+                )
+            })
+            .collect();
+
+        let mut sentence = Sentence::new(tokens, sentence, &self.tagger);
+        let guard = self.property_guard(&mut sentence).expect("TODO");
+        sentence = self.tagger.transform(sentence, guard).expect("TODO");
+
+        Some(sentence)
+    }
+}
+
+/// An iterator over sentences. Has some key properties:
+/// - Preceding whitespace is always included so the first sentence always starts at byte and char index zero.
+/// - There are no gaps between sentences i.e. `sentence[i - 1].span().end() == sentence[i].span().start()`.
+/// - Behavior for trailing whitespace is not defined. Can be included in the last sentence or not be part of any sentence.
+pub struct SentenceIter<'t> {
+    text: &'t str, // the complete text that is split into sentences
+    splits: Vec<Range<usize>>, // byte ranges of the sentences in `text`
+    tokenizer: &'t Tokenizer, // tokenizes the individual sentences
+    index: usize, // index into `splits` of the next range to use
+    position: Position, // char and byte offset by which the next sentence is shifted
+}
+
+impl<'t> Iterator for SentenceIter<'t> {
+    type Item = Sentence<'t>;
+
+    /// Returns the next sentence, shifted to its position in the text. Ends early, with `None`,
+    /// if the remaining text is whitespace only.
+    fn next(&mut self) -> Option<Self::Item> {
+        let text = self.text;
+        let mut range = self.splits.get(self.index)?.clone();
+        self.index += 1;
+
+        // as long as the current sentence contains only whitespace, add the next sentence
+        // in practice, this might never happen, but we can not make any assumption about
+        // SRX rule behavior here.
+        while text[range.clone()].trim().is_empty() && self.index < self.splits.len() {
+            range.end = self.splits[self.index].end;
+            self.index += 1;
+        }
+
+        let sentence_text = &text[range.clone()];
+        let sentence = self
+            .tokenizer
+            .tokenize_sentence(sentence_text)
+            .map(|x| x.rshift(self.position));
+
+        self.position += Position {
+            char: sentence_text.chars().count(),
+            byte: range.len(),
+        };
+
+        sentence
+    }
+}
+
+impl Component for Tokenizer {
+    fn name() -> &'static str {
+        "tokenizer" // same name as in the `binary!` calls that load a tokenizer
+    }
+}
+
+impl Tokenizer {
+    /// Gets the lexical tagger.
+    pub fn tagger(&self) -> &Tagger {
+        &self.tagger
+    }
+
+    /// Computes the byte ranges of the tokens of `text`, in the order they occur in the text.
+    /// A match of one of the `extra_join_regexes` is kept as a single token. Everything else is
+    /// split at whitespace and the splitting characters, separators are tokens of their own.
+    fn get_token_ranges<'t>(
+        &self,
+        text: &'t str,
+    ) -> impl ExactSizeIterator<Item = Range<usize>> + 't + Clone {
+        let split_char = |c: char| c.is_whitespace() || crate::utils::splitting_chars().contains(c);
+        let split_text = |segment: &'t str| {
+            let mut pieces = Vec::new();
+            for pretoken in split(segment, split_char) {
+                if self.whitelist.contains(pretoken) {
+                    // whitelisted pretokens are added right away
+                    pieces.push(pretoken);
+                } else {
+                    // otherwise, potentially split it again with `extra_split_chars` e. g. "-"
+                    pieces.extend(split(pretoken, |c| {
+                        split_char(c) || self.lang_options.extra_split_chars.contains(&c)
+                    }));
+                }
+            }
+            pieces
+        };
+
+        // find the spans to join, a match that overlaps with an earlier match is dropped
+        let mut joined_mask = vec![false; text.len()];
+        let mut joins = Vec::new();
+        for regex in self.lang_options.extra_join_regexes.iter() {
+            for mat in regex.find_iter(text) {
+                let span = mat.start()..mat.end();
+                if joined_mask[span.clone()].iter().all(|taken| !*taken) {
+                    joined_mask[span.clone()].iter_mut().for_each(|taken| *taken = true);
+                    joins.push(span);
+                }
+            }
+        }
+        joins.sort_by_key(|span| span.start);
+
+        let mut tokens = Vec::new();
+        let mut prev = 0;
+        for span in joins {
+            tokens.extend(split_text(&text[prev..span.start]));
+            prev = span.end;
+            tokens.push(&text[span]);
+        }
+        tokens.extend(split_text(&text[prev..]));
+
+        // every token is a slice of `text`, its offset is the distance between the pointers
+        tokens.into_iter().map(move |token| {
+            let byte_start = (token.as_ptr() as usize)
+                .checked_sub(text.as_ptr() as usize)
+                .expect("Each token str is a slice of the text str.");
+            byte_start..byte_start + token.len()
+        })
+    }
+}
diff --git a/nlprule/src/lang.rs b/nlprule/src/lang.rs
new file mode 100644
index 0000000..f627fef
--- /dev/null
+++ b/nlprule/src/lang.rs
@@ -0,0 +1,37 @@
+use std::path::{Path, PathBuf};
+
+const MANIFEST_DIR: &str = env!("CARGO_MANIFEST_DIR");
+
+pub fn binary_path(lang_code: &str, name: &str) -> PathBuf {
+    Path::new(MANIFEST_DIR) // result: `<manifest dir>/<lang_code>/<name>.bin`
+        .join(lang_code) // NOTE(review): `binary!` reads from `<manifest dir>/storage/...`,
+        .join(format!("{}.bin", name)) // there is no `storage` part here - confirm this is intended
+}
+
+#[allow(unused)]
+macro_rules! binary {
+    ($component: ty, $lang_code:literal, $name:literal) => {{
+        use crate::components::Component;
+        // includes `<manifest dir>/storage/<lang_code>/<name>.bin` at compile time
+        let mut bytes: &'static [u8] = include_bytes!(concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/storage/",
+            $lang_code,
+            "/",
+            $name,
+            ".bin"
+        ));
+        // the macro evaluates to whatever `from_reader` returns for these bytes
+        <$component>::from_reader(&mut bytes)
+    }};
+}
+
+#[allow(unused)]
+const ERROR_MSG: &str = "binaries are pre-tested.";
+
+#[cfg(feature = "binaries-de")]
+pub mod de;
+#[cfg(feature = "binaries-en")]
+pub mod en;
+#[cfg(feature = "binaries-es")]
+pub mod es;
diff --git a/nlprule/src/lang/de.rs b/nlprule/src/lang/de.rs
new file mode 100644
index 0000000..6e3d337
--- /dev/null
+++ b/nlprule/src/lang/de.rs
@@ -0,0 +1,31 @@
+use super::ERROR_MSG;
+use crate::{
+    components::{
+        rules::{Disambiguator, Rules},
+        tokenizer::Tokenizer,
+    },
+    properties::{tokenize, CreatePipe, Pipeline},
+};
+
+pub type Analyzer = tokenize::Pipeline<(Tokenizer, Disambiguator)>;
+pub type Correcter = Pipeline<(Analyzer, Rules)>;
+
+pub fn tokenizer() -> Tokenizer {
+    binary!(Tokenizer, "de", "tokenizer").expect(ERROR_MSG) // included storage/de/tokenizer.bin
+}
+
+pub fn disambiguator() -> Disambiguator {
+    binary!(Disambiguator, "de", "disambiguator").expect(ERROR_MSG) // de/disambiguator.bin
+}
+
+pub fn rules() -> Rules {
+    binary!(Rules, "de", "rules").expect(ERROR_MSG) // included storage/de/rules.bin
+}
+
+pub fn analyzer() -> Analyzer {
+    tokenize::Pipeline::new((tokenizer(), disambiguator())).expect(ERROR_MSG) // panics on error
+}
+
+pub fn correcter() -> Correcter {
+    Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) // panics on error
+}
diff --git a/nlprule/src/lang/en.rs b/nlprule/src/lang/en.rs
new file mode 100644
index 0000000..5dfd743
--- /dev/null
+++ b/nlprule/src/lang/en.rs
@@ -0,0 +1,42 @@
+use super::ERROR_MSG;
+use crate::{
+    components::{
+        chunker::Chunker,
+        multiword_tagger::MultiwordTagger,
+        rules::{Disambiguator, Rules},
+        tokenizer::Tokenizer,
+    },
+    properties::{tokenize, CreatePipe, Pipeline},
+};
+
+pub type Analyzer = tokenize::Pipeline<(Tokenizer, MultiwordTagger, Chunker, Disambiguator)>;
+pub type Correcter = Pipeline<(Analyzer, Rules)>;
+
+pub fn tokenizer() -> Tokenizer {
+    binary!(Tokenizer, "en", "tokenizer").expect(ERROR_MSG) // included storage/en/tokenizer.bin
+}
+
+pub fn multiword_tagger() -> MultiwordTagger {
+    binary!(MultiwordTagger, "en", "multiword_tagger").expect(ERROR_MSG) // en/multiword_tagger.bin
+}
+
+pub fn chunker() -> Chunker {
+    binary!(Chunker, "en", "chunker").expect(ERROR_MSG) // included storage/en/chunker.bin
+}
+
+pub fn disambiguator() -> Disambiguator {
+    binary!(Disambiguator, "en", "disambiguator").expect(ERROR_MSG) // en/disambiguator.bin
+}
+
+pub fn rules() -> Rules {
+    binary!(Rules, "en", "rules").expect(ERROR_MSG) // included storage/en/rules.bin
+}
+
+pub fn analyzer() -> Analyzer {
+    tokenize::Pipeline::new((tokenizer(), multiword_tagger(), chunker(), disambiguator()))
+        .expect(ERROR_MSG) // panics if the pipeline can not be created
+}
+
+pub fn correcter() -> Correcter {
+    Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) // panics on error
+}
diff --git a/nlprule/src/lang/es.rs b/nlprule/src/lang/es.rs
new file mode 100644
index 0000000..5f4194a
--- /dev/null
+++ b/nlprule/src/lang/es.rs
@@ -0,0 +1,36 @@
+use super::ERROR_MSG;
+use crate::{
+    components::{
+        multiword_tagger::MultiwordTagger,
+        rules::{Disambiguator, Rules},
+        tokenizer::Tokenizer,
+    },
+    properties::{tokenize, CreatePipe, Pipeline},
+};
+
+pub type Analyzer = tokenize::Pipeline<(Tokenizer, MultiwordTagger, Disambiguator)>;
+pub type Correcter = Pipeline<(Analyzer, Rules)>;
+
+pub fn tokenizer() -> Tokenizer {
+    binary!(Tokenizer, "es", "tokenizer").expect(ERROR_MSG) // included storage/es/tokenizer.bin
+}
+
+pub fn multiword_tagger() -> MultiwordTagger {
+    binary!(MultiwordTagger, "es", "multiword_tagger").expect(ERROR_MSG) // es/multiword_tagger.bin
+}
+
+pub fn disambiguator() -> Disambiguator {
+    binary!(Disambiguator, "es", "disambiguator").expect(ERROR_MSG) // es/disambiguator.bin
+}
+
+pub fn rules() -> Rules {
+    binary!(Rules, "es", "rules").expect(ERROR_MSG) // included storage/es/rules.bin
+}
+
+pub fn analyzer() -> Analyzer {
+    tokenize::Pipeline::new((tokenizer(), multiword_tagger(), disambiguator())).expect(ERROR_MSG)
+}
+
+pub fn correcter() -> Correcter {
+    Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) // panics on error
+}
diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs
index 585c591..4e4edf4 100644
--- a/nlprule/src/lib.rs
+++ b/nlprule/src/lib.rs
@@ -10,13 +10,12 @@
 //! Correct a text:
 //!
 //! ```no_run
-//! use nlprule::{Tokenizer, Rules};
+//! use nlprule::lang::en;
 //!
-//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?;
-//! let rules = Rules::new("path/to/en_rules.bin")?;
+//! let correcter = en::correcter();
 //!
 //! assert_eq!(
-//!     rules.correct("She was not been here since Monday.", &tokenizer),
+//!     correcter.correct("She was not been here since Monday.").collect::<Vec<String>>().join(""),
 //!     String::from("She was not here since Monday.")
 //! );
 //! # Ok::<(), nlprule::Error>(())
@@ -25,64 +24,58 @@
 //! Get suggestions and correct a text:
 //!
 //! ```no_run
-//! use nlprule::{Tokenizer, Rules, types::Suggestion, rules::apply_suggestions};
+//! use nlprule::lang::en;
 //!
-//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?;
-//! let rules = Rules::new("path/to/en_rules.bin")?;
+//! let correcter = en::correcter();
 //!
 //! let text = "She was not been here since Monday.";
 //!
-//! let suggestions = rules.suggest(text, &tokenizer);
+//! let suggestions = correcter.suggest(text).next().expect("`text` contains one sentence.");
 //! assert_eq!(*suggestions[0].span().char(), 4usize..16);
 //! assert_eq!(suggestions[0].replacements(), vec!["was not", "has not been"]);
 //! assert_eq!(suggestions[0].source(), "GRAMMAR/WAS_BEEN/1");
 //! assert_eq!(suggestions[0].message(), "Did you mean was not or has not been?");
 //!
-//! let corrected = apply_suggestions(text, &suggestions);
-//!
-//! assert_eq!(corrected, "She was not here since Monday.");
 //! # Ok::<(), nlprule::Error>(())
 //! ```
 //!
 //! Tokenize & analyze a text:
 //!
 //! ```no_run
-//! use nlprule::Tokenizer;
+//! use nlprule::lang::en;
+//! use nlprule::properties::Tokenize;
 //!
-//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?;
+//! let analyzer = en::analyzer();
 //!
 //! let text = "A brief example is shown.";
 //!
 //! // returns an iterator over sentences
-//! let sentence = tokenizer.pipe(text).next().expect("`text` contains one sentence.");
+//! let sentence = analyzer.tokenize(text).next().expect("`text` contains one sentence.");
 //!
 //! println!("{:#?}", sentence);
-//! assert_eq!(sentence.tokens()[1].word().text().as_str(), "brief");
-//! assert_eq!(sentence.tokens()[1].word().tags()[0].pos().as_str(), "JJ");
-//! assert_eq!(sentence.tokens()[1].chunks(), vec!["I-NP-singular"]);
+//! assert_eq!(sentence.tokens()[1].as_str(), "brief");
+//! assert_eq!(sentence.tokens()[1].tags()?.iter().next().unwrap().pos().as_str(), "JJ");
+//! assert_eq!(sentence.tokens()[1].chunks()?, &["I-NP-singular"]);
 //! // some other information like char / byte span, lemmas etc. is also set!
 //! # Ok::<(), nlprule::Error>(())
 //! ```
-//! ---
-//! Binaries are distributed with [Github releases](https://github.com/bminixhofer/nlprule/releases).
 
-#![warn(missing_docs)]
+// #![warn(missing_docs)]
 use std::io;
 
 use thiserror::Error;
 
 #[cfg(feature = "compile")]
 pub mod compile;
+pub mod components;
 mod filter;
+#[macro_use]
+pub mod lang;
+pub mod properties;
 pub mod rule;
-pub mod rules;
-pub mod tokenizer;
 pub mod types;
 pub(crate) mod utils;
 
-pub use rules::Rules;
-pub use tokenizer::Tokenizer;
-
 #[derive(Error, Debug)]
 #[allow(missing_docs)]
 pub enum Error {
@@ -93,30 +86,8 @@ pub enum Error {
     Serialization(#[from] bincode::Error),
     #[error(transparent)]
     IdError(#[from] rule::id::Error),
-}
-
-/// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format.
-pub fn tokenizer_filename(lang_code: &str) -> String {
-    format!("{}_tokenizer.bin", lang_code)
-}
-
-/// Gets the canonical filename for the rules binary for a language code in ISO 639-1 (two-letter) format.
-pub fn rules_filename(lang_code: &str) -> String {
-    format!("{}_rules.bin", lang_code)
-}
-
-/// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format.
-#[macro_export]
-macro_rules! tokenizer_filename {
-    ($lang_code:literal) => {
-        concat!($lang_code, "_tokenizer.bin")
-    };
-}
-
-/// Gets the canonical filename for the rules binary for a language code in ISO 639-1 (two-letter) format.
-#[macro_export]
-macro_rules! rules_filename {
-    ($lang_code:literal) => {
-        concat!($lang_code, "_rules.bin")
-    };
+    #[error(transparent)]
+    Property(#[from] properties::Error),
+    #[error("Test failed. See logs for details.")]
+    TestFailed,
 }
diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs
new file mode 100644
index 0000000..de151e0
--- /dev/null
+++ b/nlprule/src/properties.rs
@@ -0,0 +1,687 @@
+use serde::{Deserialize, Serialize};
+
+use crate::types::*;
+use thiserror::Error;
+
+pub use suggest::Suggest;
+pub use tokenize::Tokenize;
+pub use transform::Transform;
+
+pub mod suggest {
+    use super::*;
+
+    /// Correct a text by applying suggestions to it.
+    /// In the case of multiple possible replacements, always chooses the first one.
+    pub fn apply_suggestions(sentence: &Sentence, suggestions: &[Suggestion]) -> String {
+        let mut offset: isize = -(sentence.span().char().start as isize); // shift from suggestion span positions to indices in `chars`
+        let mut chars: Vec<_> = sentence.text().chars().collect();
+        // NOTE(review): the running offset presumably relies on suggestions arriving in ascending, non-overlapping span order — confirm.
+        for suggestion in suggestions {
+            let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); // NOTE(review): `[0]` panics if `replacements()` is empty
+            chars.splice(
+                (suggestion.span().char().start as isize + offset) as usize
+                    ..(suggestion.span().char().end as isize + offset) as usize,
+                replacement.iter().cloned(),
+            );
+            offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize; // later spans move by the length change
+        }
+        // Re-assemble the edited characters into a `String`.
+        chars.into_iter().collect()
+    }
+
+    pub trait Suggest {
+        fn properties(&self) -> Properties {
+            Properties::default()
+        }
+
+        fn property_guard(&self, sentence: &Sentence) -> Result<PropertyGuard, Error> {
+            self.properties().build(sentence)
+        }
+
+        fn suggest(&self, sentence: &Sentence) -> Result<Vec<Suggestion>, Error>;
+
+        fn correct(&self, sentence: &Sentence) -> Result<String, Error> {
+            let suggestions = self.suggest(sentence)?;
+            Ok(apply_suggestions(&sentence, &suggestions))
+        }
+
+        #[allow(unused_variables)]
+        fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
+            Ok(())
+        }
+    }
+
+    impl<'a, T> Suggest for &'a T
+    where
+        T: Suggest,
+    {
+        fn properties(&self) -> Properties {
+            (*self).properties()
+        }
+
+        fn property_guard(&self, sentence: &Sentence) -> Result<PropertyGuard, Error> {
+            (*self).property_guard(sentence)
+        }
+
+        fn suggest(&self, sentence: &Sentence) -> Result<Vec<Suggestion>, Error> {
+            (*self).suggest(sentence)
+        }
+
+        fn correct(&self, sentence: &Sentence) -> Result<String, Error> {
+            (*self).correct(sentence)
+        }
+
+        fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
+            (*self).test(tokenizer)
+        }
+    }
+}
+
+pub mod transform {
+    use super::*;
+
+    pub trait Transform {
+        fn properties(&self) -> PropertiesMut {
+            PropertiesMut::default()
+        }
+
+        fn property_guard(&self, sentence: &mut Sentence) -> Result<PropertyGuardMut, Error> {
+            self.properties().build(sentence)
+        }
+
+        fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result<Sentence<'t>, Error>;
+
+        #[allow(unused_variables)]
+        fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
+            Ok(())
+        }
+    }
+
+    impl<'a, T> Transform for &'a T
+    where
+        T: Transform,
+    {
+        fn properties(&self) -> PropertiesMut {
+            (*self).properties()
+        }
+
+        fn property_guard(&self, sentence: &mut Sentence) -> Result<PropertyGuardMut, Error> {
+            (*self).property_guard(sentence)
+        }
+
+        fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result<Sentence<'t>, Error> {
+            (*self).transform(sentence)
+        }
+
+        fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
+            (*self).test(tokenizer)
+        }
+    }
+
+    #[derive(Serialize, Deserialize, Clone)]
+    pub struct Pipeline<T>(pub(super) T, pub(super) PropertiesMut);
+}
+
+pub mod tokenize {
+    use super::*;
+
+    pub trait Tokenize {
+        fn properties(&self) -> PropertiesMut {
+            PropertiesMut::default()
+        }
+
+        fn property_guard(&self, sentence: &mut Sentence) -> Result<PropertyGuardMut, Error> {
+            self.properties().build(sentence)
+        }
+
+        fn tokenize<'t>(&'t self, text: &'t str) -> Box<dyn Iterator<Item = Sentence<'t>> + 't>;
+
+        fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option<Sentence<'t>>;
+
+        fn test(&self) -> Result<(), crate::Error> {
+            Ok(())
+        }
+    }
+
+    impl<'a, T> Tokenize for &'a T
+    where
+        T: Tokenize,
+    {
+        fn properties(&self) -> PropertiesMut {
+            (*self).properties()
+        }
+
+        fn property_guard(&self, sentence: &mut Sentence) -> Result<PropertyGuardMut, Error> {
+            (*self).property_guard(sentence)
+        }
+
+        fn tokenize<'t>(&'t self, text: &'t str) -> Box<dyn Iterator<Item = Sentence<'t>> + 't> {
+            (*self).tokenize(text)
+        }
+
+        fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option<Sentence<'t>> {
+            (*self).tokenize_sentence(sentence)
+        }
+
+        fn test(&self) -> Result<(), crate::Error> {
+            (*self).test()
+        }
+    }
+
+    #[derive(Serialize, Deserialize, Clone)]
+    pub struct Pipeline<T>(pub(super) T, pub(super) PropertiesMut);
+}
+
+#[derive(Serialize, Deserialize, Clone)]
+pub struct Pipeline<T>(T, PropertiesMut);
+
+impl<T> transform::Pipeline<T> {
+    pub fn components(&self) -> &T {
+        &self.0
+    }
+
+    pub fn components_mut(&mut self) -> &mut T {
+        &mut self.0
+    }
+}
+
+impl<T> tokenize::Pipeline<T> {
+    pub fn components(&self) -> &T {
+        &self.0
+    }
+
+    pub fn components_mut(&mut self) -> &mut T {
+        &mut self.0
+    }
+}
+
+impl<T> Pipeline<T> {
+    pub fn components(&self) -> &T {
+        &self.0
+    }
+
+    pub fn components_mut(&mut self) -> &mut T {
+        &mut self.0
+    }
+}
+
+#[derive(Error, Debug)]
+#[allow(missing_docs)]
+pub enum Error {
+    #[error("unset token property: {0:?}.")]
+    Unset(Property),
+    #[error("invalid pipeline: properties {0:?} are read without being written.")]
+    InvalidPipeline(Vec<Property>),
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum Property {
+    Tags = 0,
+    Chunks = 1,
+}
+
+impl Property {
+    pub fn properties() -> &'static [Property] {
+        &[Property::Tags, Property::Chunks]
+    }
+}
+
+#[derive(Debug, Copy, Clone, Serialize, Deserialize, Default)]
+struct Bitset(u16);
+
+impl Bitset {
+    pub fn insert(&mut self, value: Property) {
+        self.0 |= 1 << (value as u16);
+    }
+
+    pub fn contains(&self, value: &Property) -> bool {
+        self.0 & (1 << (*value as u16)) != 0
+    }
+
+    pub fn union(mut self, other: Bitset) -> Self {
+        self.0 |= other.0;
+        self
+    }
+
+    pub fn intersection(mut self, other: Bitset) -> Self {
+        self.0 &= other.0;
+        self
+    }
+
+    pub fn inverse(mut self) -> Self {
+        self.0 = !self.0;
+        self
+    }
+
+    pub fn into_iter<'a>(self) -> impl Iterator<Item = Property> + 'a {
+        // Walk every property listed by `Property::properties()`, in that
+        // order, and yield only those whose bit is set in this mask.
+        // `Bitset` is `Copy`, so the closure simply takes its own copy.
+        let all = Property::properties();
+        all.iter()
+            .copied()
+            .filter(move |candidate| self.contains(candidate))
+    }
+}
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
+pub struct Properties {
+    read_mask: Bitset,
+}
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
+pub struct PropertiesMut {
+    read_mask: Bitset,
+    write_mask: Bitset,
+}
+
+impl std::iter::FromIterator<Properties> for Properties {
+    fn from_iter<T: IntoIterator<Item = Properties>>(iter: T) -> Self {
+        let mut out = Properties::default();
+
+        for properties in iter {
+            out = out.union(properties)
+        }
+
+        out
+    }
+}
+
+impl std::iter::FromIterator<PropertiesMut> for PropertiesMut {
+    fn from_iter<T: IntoIterator<Item = PropertiesMut>>(iter: T) -> Self {
+        let mut out = PropertiesMut::default();
+
+        for properties in iter {
+            out = out.union(properties)
+        }
+
+        out
+    }
+}
+
+impl Properties {
+    pub fn read(mut self, properties: &[Property]) -> Self {
+        for property in properties {
+            self.read_mask.insert(*property);
+        }
+
+        self
+    }
+
+    pub fn write(self, properties: &[Property]) -> PropertiesMut {
+        let mut write_mask = Bitset::default();
+        let mut read_mask = self.read_mask;
+
+        for property in properties {
+            // write implies read
+            read_mask.insert(*property);
+            write_mask.insert(*property);
+        }
+
+        PropertiesMut {
+            read_mask,
+            write_mask,
+        }
+    }
+
+    pub fn union(mut self, properties: Properties) -> Self {
+        self.read_mask = self.read_mask.union(properties.read_mask);
+
+        self
+    }
+
+    pub fn build(&self, sentence: &Sentence) -> Result<PropertyGuard, Error> {
+        // Check each property this set reads against the token returned
+        // by `sentence.first()`. The first requested property found unset
+        // aborts with `Error::Unset`; properties that are not requested
+        // are never inspected.
+        for &requested in Property::properties() {
+            if !self.read_mask.contains(&requested) {
+                continue;
+            }
+
+            let first_token = sentence.first();
+            let is_set = match requested {
+                Property::Tags => first_token.tags.is_some(),
+                Property::Chunks => first_token.chunks.is_some(),
+            };
+            if !is_set {
+                return Err(Error::Unset(requested));
+            }
+        }
+        let read_mask = self.read_mask;
+        Ok(PropertyGuard { read_mask })
+    }
+}
+
+impl PropertiesMut {
+    pub(crate) fn reads_without_write(&self) -> impl Iterator<Item = Property> {
+        self.read_mask
+            .intersection(self.write_mask.inverse())
+            .into_iter()
+    }
+
+    pub fn union(mut self, properties: PropertiesMut) -> Self { // merge two property sets mask by mask
+        self.read_mask = self.read_mask.union(properties.read_mask);
+        // Writes come from the other set's *write* mask; using its read mask would mark read-only properties as written.
+        self.write_mask = self.write_mask.union(properties.write_mask);
+        self
+    }
+
+    pub fn chain(mut self, next: PropertiesMut) -> Self { // combine `self` with the set `next`; returns the result by value
+        let next_reads = next.read_mask.intersection(next.write_mask.inverse()); // read by `next` but not written by `next`
+        let new_reads = next_reads.intersection(self.write_mask.inverse()); // ...and not written by `self` either
+        // Only those remaining reads are added to the read mask; the write masks are simply merged.
+        self.read_mask = self.read_mask.union(new_reads);
+        self.write_mask = self.write_mask.union(next.write_mask);
+        self
+    }
+
+    pub fn build(&self, sentence: &mut Sentence) -> Result<PropertyGuardMut, Error> {
+        for property in Property::properties() {
+            if self.write_mask.contains(property) {
+                match property {
+                    Property::Tags => {
+                        if sentence.first().tags.is_none() {
+                            sentence
+                                .iter_mut()
+                                .for_each(|token| token.tags = Some(Tags::default()));
+                        }
+                    }
+                    Property::Chunks => {
+                        if sentence.first().chunks.is_none() {
+                            sentence
+                                .iter_mut()
+                                .for_each(|token| token.chunks = Some(Vec::default()));
+                        }
+                    }
+                }
+            }
+        }
+
+        for property in Property::properties() {
+            if self.read_mask.contains(property) {
+                match *property {
+                    Property::Tags => {
+                        if sentence.first().tags.is_none() {
+                            return Err(Error::Unset(Property::Tags));
+                        }
+                    }
+                    Property::Chunks => {
+                        if sentence.first().chunks.is_none() {
+                            return Err(Error::Unset(Property::Chunks));
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(PropertyGuardMut {
+            read_mask: self.read_mask,
+            write_mask: self.write_mask,
+        })
+    }
+}
+
+#[derive(Debug, Copy, Clone)]
+pub struct PropertyGuard {
+    read_mask: Bitset,
+}
+
+#[derive(Debug, Copy, Clone)]
+pub struct PropertyGuardMut {
+    read_mask: Bitset,
+    write_mask: Bitset,
+}
+
+impl PropertyGuard {
+    pub fn chunks<'a>(&self, token: &'a Token) -> Result<&'a [String], Error> {
+        match (
+            token.chunks.as_deref(),
+            self.read_mask.contains(&Property::Chunks),
+        ) {
+            (Some(chunks), true) => Ok(chunks),
+            _ => Err(Error::Unset(Property::Chunks)),
+        }
+    }
+
+    pub fn tags<'a, 't>(&self, token: &'a Token<'t>) -> Result<&'a Tags<'t>, Error> {
+        match (
+            token.tags.as_ref(),
+            self.read_mask.contains(&Property::Tags),
+        ) {
+            (Some(tags), true) => Ok(tags),
+            _ => Err(Error::Unset(Property::Tags)),
+        }
+    }
+}
+
+impl PropertyGuardMut {
+    pub fn chunks<'a>(&self, token: &'a Token) -> Result<&'a [String], Error> {
+        match (
+            token.chunks.as_deref(),
+            self.read_mask.contains(&Property::Chunks),
+        ) {
+            (Some(chunks), true) => Ok(chunks),
+            _ => Err(Error::Unset(Property::Chunks)),
+        }
+    }
+
+    pub fn tags<'a, 't>(&self, token: &'a Token<'t>) -> Result<&'a Tags<'t>, Error> {
+        match (
+            token.tags.as_ref(),
+            self.read_mask.contains(&Property::Tags),
+        ) {
+            (Some(tags), true) => Ok(tags),
+            _ => Err(Error::Unset(Property::Tags)),
+        }
+    }
+
+    pub fn chunks_mut<'a, 't>(
+        &self,
+        token: &'a mut Token<'t>,
+    ) -> Result<&'a mut Vec<String>, Error> {
+        match (
+            token.chunks.as_mut(),
+            self.write_mask.contains(&Property::Chunks),
+        ) {
+            (Some(chunks), true) => Ok(chunks),
+            _ => Err(Error::Unset(Property::Chunks)),
+        }
+    }
+
+    pub fn tags_mut<'a, 't>(&self, token: &'a mut Token<'t>) -> Result<&'a mut Tags<'t>, Error> {
+        match (
+            token.tags.as_mut(),
+            self.write_mask.contains(&Property::Tags),
+        ) {
+            (Some(tags), true) => Ok(tags),
+            _ => Err(Error::Unset(Property::Tags)),
+        }
+    }
+
+    pub fn downgrade(self) -> PropertyGuard {
+        PropertyGuard {
+            read_mask: self.read_mask,
+        }
+    }
+}
+
+pub trait CreatePipe<T>: Sized {
+    fn new(components: T) -> Result<Self, Error>;
+}
+
+macro_rules! make_subpipe {
+    ($pipe:ty, $first:expr) => {
+        Ok::<_, crate::Error>($first)
+    };
+    ($pipe:ty, $first:expr, $($name:expr),+) => {
+        <$pipe>::new(($first, $($name,)+))
+    }
+}
+
+macro_rules! impl_pipeline {
+    ( $first:ident, $last:ident, $($name:ident),*) => {
+        // Case 1: Tokenize -> Transform -> ... -> Transform
+        impl<$first: Tokenize, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for tokenize::Pipeline<($first, $($name,)* $last)> {
+            #[allow(non_snake_case, unused_mut)]
+            fn new(components: ($first,  $($name,)* $last)) -> Result<Self, Error> {
+                let (ref $first, $(ref $name,)* ref $last) = components;
+                // Fold every component's properties, in order, into the pipeline's properties.
+                let mut properties = $first.properties();
+                $(properties = properties.chain($name.properties());)*
+                properties = properties.chain($last.properties()); // `chain` returns the merged set; it must be stored
+                // Reject the pipeline if any property is read without ever being written.
+                if properties.reads_without_write().next().is_some() {
+                    return Err(Error::InvalidPipeline(properties.reads_without_write().collect()));
+                }
+
+                Ok(tokenize::Pipeline(components, properties))
+            }
+        }
+
+        impl<$first: Tokenize, $($name: Transform,)* $last: Transform> Tokenize for tokenize::Pipeline<($first, $($name,)* $last)> {
+            fn properties(&self) -> PropertiesMut {
+                self.1
+            }
+
+            #[allow(non_snake_case)]
+            fn tokenize<'t>(&'t self, text: &'t str) -> Box<dyn Iterator<Item = Sentence<'t>> + 't> {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+                let sentences = $first.tokenize(text).map(move |mut sentence| {
+                    $(sentence = $name.transform(sentence).unwrap();)*
+                    sentence = $last.transform(sentence).unwrap();
+                    sentence
+                });
+
+                Box::new(sentences)
+            }
+
+            #[allow(non_snake_case, unused_mut)]
+            fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option<Sentence> {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+                let mut sentence = $first.tokenize_sentence(sentence)?;
+                $(sentence = $name.transform(sentence).unwrap();)*
+                Some($last.transform(sentence).unwrap())
+            }
+
+            #[allow(non_snake_case)]
+            fn test(&self) -> Result<(), crate::Error> {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+
+                let subpipe = make_subpipe!(tokenize::Pipeline<_>, $first $(,$name)*)?;
+                subpipe.test()?;
+
+                $last.test(subpipe)?;
+
+                Ok(())
+            }
+        }
+
+        // Case 2: Transform -> ... -> Transform
+        impl<$first: Transform, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for transform::Pipeline<($first, $($name,)* $last)> {
+            #[allow(non_snake_case, unused_mut)]
+            fn new(components: ($first,  $($name,)* $last)) -> Result<Self, Error> {
+                let (ref $first, $(ref $name,)* ref $last) = components;
+                // Fold every component's properties, in order, into the pipeline's properties.
+                let mut properties = $first.properties();
+                $(properties = properties.chain($name.properties());)*
+                properties = properties.chain($last.properties()); // `chain` returns the merged set; it must be stored
+                // Reads left unwritten are not rejected here; this constructor always returns `Ok`.
+                Ok(transform::Pipeline(components, properties))
+            }
+        }
+
+        impl<$first: Transform, $($name: Transform,)* $last: Transform> Transform for transform::Pipeline<($first, $($name,)* $last)> {
+            fn properties(&self) -> PropertiesMut {
+                self.1
+            }
+
+            #[allow(non_snake_case)]
+            fn transform<'t>(&'t self, mut sentence: Sentence<'t>) -> Result<Sentence<'t>, crate::properties::Error> {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+                sentence = $first.transform(sentence)?;
+                $(sentence = $name.transform(sentence)?;)*
+                sentence = $last.transform(sentence)?;
+                Ok(sentence)
+            }
+
+            #[allow(non_snake_case)]
+            fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+
+                $first.test(&tokenizer)?;
+                let tokenizer_pipe = tokenize::Pipeline::new((&tokenizer, $first))?;
+                let subpipe = make_subpipe!(transform::Pipeline<_>, $($name,)* $last)?;
+
+                subpipe.test(tokenizer_pipe)?;
+                Ok(())
+            }
+        }
+
+        // Case 3: Tokenize -> Transform -> ... -> Transform -> Suggest
+        impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> CreatePipe<($first, $($name,)* $last)> for Pipeline<($first, $($name,)* $last)> {
+            #[allow(non_snake_case, unused_mut)]
+            fn new(components: ($first,  $($name,)* $last)) -> Result<Self, Error> {
+                let (ref $first, $(ref $name,)* ref $last) = components;
+                // Fold every component's properties, in order, into the pipeline's properties.
+                let mut properties = $first.properties();
+                $(properties = properties.chain($name.properties());)*
+                properties = properties.chain($last.properties().write(&[])); // `write(&[])` turns the read-only set into a `PropertiesMut`
+                // Reject the pipeline if any property is read without ever being written.
+                if properties.reads_without_write().next().is_some() {
+                    return Err(Error::InvalidPipeline(properties.reads_without_write().collect()));
+                }
+
+                Ok(Pipeline(components, properties))
+            }
+        }
+
+        impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> Pipeline<($first, $($name,)* $last)> {
+            pub fn properties(&self) -> PropertiesMut {
+                self.1
+            }
+
+            #[allow(non_snake_case, unused_mut)]
+            pub fn suggest<'t>(&'t self, text: &'t str) -> impl Iterator<Item = Vec<Suggestion>> + 't {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+
+                let sentences = $first.tokenize(text).map(move |mut sentence| {
+                    $(sentence = $name.transform(sentence).unwrap();)*
+                    $last.suggest(&sentence).unwrap()
+                });
+
+                sentences
+            }
+
+            #[allow(non_snake_case, unused_mut)]
+            pub fn correct<'t>(&'t self, text: &'t str) -> impl Iterator<Item = String> + 't {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+
+                let sentences = $first.tokenize(text).map(move |mut sentence| {
+                    $(sentence = $name.transform(sentence).unwrap();)*
+                    $last.correct(&sentence).unwrap()
+                });
+
+                sentences
+            }
+
+            #[allow(non_snake_case)]
+            pub fn test(&self) -> Result<(), crate::Error> {
+                let (ref $first, $(ref $name,)* ref $last) = self.0;
+
+                let subpipe = make_subpipe!(tokenize::Pipeline<_>, $first $(,$name)*)?;
+                subpipe.test()?;
+
+                $last.test(subpipe)?;
+
+                Ok(())
+            }
+        }
+    };
+}
+
+impl_pipeline! { A, B, }
+impl_pipeline! { A, C, B  }
+impl_pipeline! { A, D, B, C }
+impl_pipeline! { A, E, B, C, D }
diff --git a/nlprule/src/rule/disambiguation.rs b/nlprule/src/rule/disambiguation.rs
index 207c289..c684a53 100644
--- a/nlprule/src/rule/disambiguation.rs
+++ b/nlprule/src/rule/disambiguation.rs
@@ -1,6 +1,6 @@
 use std::ops::Range;
 
-use crate::types::*;
+use crate::{properties::PropertyGuardMut, types::*};
 use itertools::Itertools;
 use serde::{Deserialize, Serialize};
 
@@ -52,21 +52,25 @@ pub enum Disambiguation {
 }
 
 impl Disambiguation {
-    pub fn apply<'t>(&'t self, groups: Vec<Vec<&mut Token<'t>>>) {
+    pub fn apply<'t>(
+        &'t self,
+        groups: Vec<Vec<&mut Token<'t>>>,
+        guard: PropertyGuardMut,
+    ) -> Result<(), crate::properties::Error> {
         match self {
             Disambiguation::Remove(data_or_filters) => {
                 for (group, data_or_filter) in groups.into_iter().zip(data_or_filters) {
                     for token in group.into_iter() {
                         match data_or_filter {
                             either::Left(data) => {
-                                token.tags_mut().retain(|x| {
+                                guard.tags_mut(token)?.retain(|x| {
                                     !(x.pos() == data.pos()
                                         && (data.lemma().as_str().is_empty()
                                             || x.lemma() == data.lemma()))
                                 });
                             }
                             either::Right(filter) => {
-                                filter.remove(token.tags_mut());
+                                filter.remove(guard.tags_mut(token)?);
                             }
                         }
                     }
@@ -78,31 +82,32 @@ impl Disambiguation {
                         match data_or_filter {
                             either::Left(limit) => {
                                 for token in group.into_iter() {
-                                    let last = token
-                                        .tags()
-                                        .iter()
-                                        .next()
-                                        .and_then(|x| {
-                                            if *x.lemma() != WordId::empty() {
-                                                Some(x.lemma().clone())
-                                            } else {
-                                                None
-                                            }
-                                        })
-                                        .unwrap_or_else(|| token.text().clone());
-
-                                    token.tags_mut().retain(|x| x.pos() == limit.pos());
-
-                                    if token.tags().is_empty() {
+                                    let last = {
+                                        let tags = guard.tags(token)?;
+                                        tags.iter()
+                                            .next()
+                                            .and_then(|x| {
+                                                if *x.lemma() != WordId::empty() {
+                                                    Some(x.lemma().clone())
+                                                } else {
+                                                    None
+                                                }
+                                            })
+                                            .unwrap_or_else(|| tags.id().clone())
+                                    };
+
+                                    guard.tags_mut(token)?.retain(|x| x.pos() == limit.pos());
+
+                                    if guard.tags(token)?.is_empty() {
                                         if *retain_last {
-                                            token
-                                                .tags_mut()
+                                            guard
+                                                .tags_mut(token)?
                                                 .push(WordData::new(last, limit.pos().clone()));
                                         } else {
-                                            let lemma = token.text().clone();
+                                            let lemma = guard.tags(token)?.id().clone();
 
-                                            token
-                                                .tags_mut()
+                                            guard
+                                                .tags_mut(token)?
                                                 .push(WordData::new(lemma, limit.pos().clone()));
                                         }
                                     }
@@ -110,7 +115,7 @@ impl Disambiguation {
                             }
                             either::Right(filter) => {
                                 for token in group.into_iter() {
-                                    filter.keep(token.tags_mut());
+                                    filter.keep(guard.tags_mut(token)?);
                                 }
                             }
                         }
@@ -122,15 +127,17 @@ impl Disambiguation {
                     for token in group.into_iter() {
                         let data = WordData::new(
                             if data.lemma().as_str().is_empty() {
-                                token.text().clone()
+                                guard.tags(token)?.id().clone()
                             } else {
                                 data.lemma().clone()
                             },
                             data.pos().clone(),
                         );
 
-                        token.tags_mut().push(data);
-                        token.tags_mut().retain(|x| !x.pos().as_str().is_empty());
+                        let tags = guard.tags_mut(token)?;
+
+                        tags.push(data);
+                        tags.retain(|x| !x.pos().as_str().is_empty());
                     }
                 }
             }
@@ -139,15 +146,17 @@ impl Disambiguation {
                     for token in group.into_iter() {
                         let data = WordData::new(
                             if data.lemma().as_str().is_empty() {
-                                token.text().clone()
+                                guard.tags(token)?.id().clone()
                             } else {
                                 data.lemma().clone()
                             },
                             data.pos().clone(),
                         );
 
-                        token.tags_mut().clear();
-                        token.tags_mut().push(data);
+                        let tags = guard.tags_mut(token)?;
+
+                        tags.clear();
+                        tags.push(data);
                     }
                 }
             }
@@ -160,14 +169,14 @@ impl Disambiguation {
                     for token in group.iter() {
                         if *use_mask_val {
                             for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) {
-                                *mask_val = *mask_val && PosFilter::and(filter, token.tags());
+                                *mask_val = *mask_val && PosFilter::and(filter, guard.tags(token)?);
                             }
                         }
                     }
                 }
 
                 if !filter_mask.iter().any(|x| *x) {
-                    return;
+                    return Ok(());
                 }
 
                 let to_apply: Vec<_> = filter_mask
@@ -188,16 +197,17 @@ impl Disambiguation {
                 {
                     if *use_mask_val {
                         for token in group.into_iter() {
-                            let before = token.tags().clone();
+                            let tags = guard.tags_mut(token)?;
+                            let before = tags.clone();
 
-                            PosFilter::apply(&to_apply, token.tags_mut());
+                            PosFilter::apply(&to_apply, tags);
 
                             if let Some(disambig) = disambig {
-                                disambig.keep(token.tags_mut());
+                                disambig.keep(tags);
                             }
 
-                            if token.tags().is_empty() {
-                                *token.tags_mut() = before;
+                            if tags.is_empty() {
+                                *tags = before;
                             }
                         }
                     }
@@ -205,6 +215,8 @@ impl Disambiguation {
             }
             Disambiguation::Nop => {}
         }
+
+        Ok(())
     }
 }
 
diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs
index 6e042de..d0fdef5 100644
--- a/nlprule/src/rule/engine/composition.rs
+++ b/nlprule/src/rule/engine/composition.rs
@@ -1,6 +1,6 @@
 use std::iter;
 
-use crate::{tokenizer::tag::Tagger, types::*, utils::regex::Regex};
+use crate::{components::tagger::Tagger, properties::*, types::*, utils::regex::Regex};
 use enum_dispatch::enum_dispatch;
 use lazy_static::lazy_static;
 use serde::{Deserialize, Serialize};
@@ -161,10 +161,15 @@ pub struct Quantifier {
 
 #[enum_dispatch]
 pub trait Atomable: Send + Sync {
-    fn is_match(&self, context: Context, position: usize) -> bool;
+    fn is_match(&self, context: Context, position: usize)
+        -> Result<bool, crate::properties::Error>;
+
+    fn compute_properties(&self) -> Properties {
+        Properties::default()
+    }
 }
 
-#[enum_dispatch(Atomable)]
+#[enum_dispatch(Atomable, ReadProperties)]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub enum Atom {
     ChunkAtom(concrete::ChunkAtom),
@@ -180,7 +185,8 @@ pub enum Atom {
 }
 
 pub mod concrete {
-    use super::{Atomable, Context, Matcher, TextMatcher, WordDataMatcher};
+    use super::{Atomable, Context, Matcher, Properties, Property, TextMatcher, WordDataMatcher};
+    use lazy_static::lazy_static;
     use serde::{Deserialize, Serialize};
 
     #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -189,11 +195,28 @@ pub mod concrete {
     }
 
     impl Atomable for TextAtom {
-        fn is_match(&self, context: Context, position: usize) -> bool {
+        /// Runs the text matcher on the id of the tags of the token at `position`.
+        fn is_match(
+            &self,
+            context: Context,
+            position: usize,
+        ) -> Result<bool, crate::properties::Error> {
             let (sentence, _) = context;
 
-            self.matcher
-                .is_match(&sentence.index(position).text(), Some(context), None)
+            // the tags are read through the property guard; a guard error aborts the match
+            Ok(self.matcher.is_match(
+                sentence.guard().tags(sentence.index(position))?.id(),
+                Some(context),
+                None,
+            ))
+        }
+
+        /// Declares that matching reads `Property::Tags`; the value is kept in a static.
+        fn compute_properties(&self) -> Properties {
+            lazy_static! {
+                static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]);
+            }
+            *PROPERTIES
         }
     }
 
@@ -203,11 +223,27 @@ pub mod concrete {
     }
 
     impl Atomable for ChunkAtom {
-        fn is_match(&self, context: Context, position: usize) -> bool {
+        /// Runs the matcher on the chunks of the token at `position`, read through the guard.
+        fn is_match(
+            &self,
+            context: Context,
+            position: usize,
+        ) -> Result<bool, crate::properties::Error> {
             let (sentence, _) = context;
 
-            self.matcher
-                .is_slice_match(&sentence.index(position).chunks(), Some(context), None)
+            Ok(self.matcher.is_slice_match(
+                sentence.guard().chunks(sentence.index(position))?,
+                Some(context),
+                None,
+            ))
+        }
+
+        /// Declares that matching reads `Property::Chunks`; the value is kept in a static.
+        fn compute_properties(&self) -> Properties {
+            lazy_static! {
+                static ref PROPERTIES: Properties = Properties::default().read(&[Property::Chunks]);
+            }
+            *PROPERTIES
         }
     }
 
@@ -217,10 +251,16 @@ pub mod concrete {
     }
 
     impl Atomable for SpaceBeforeAtom {
-        fn is_match(&self, context: Context, position: usize) -> bool {
+        /// Matches if `has_space_before()` of the token at `position` equals `value`.
+        /// Always returns `Ok`: nothing is read through the property guard here.
+        fn is_match(
+            &self,
+            context: Context,
+            position: usize,
+        ) -> Result<bool, crate::properties::Error> {
             let (sentence, _) = context;
 
-            sentence.index(position).has_space_before() == self.value
+            Ok(sentence.index(position).has_space_before() == self.value)
         }
     }
 
@@ -231,12 +269,26 @@ pub mod concrete {
     }
 
     impl Atomable for WordDataAtom {
-        fn is_match(&self, context: Context, position: usize) -> bool {
+        /// Runs the word data matcher over the tags of the token at `position`.
+        fn is_match(
+            &self,
+            context: Context,
+            position: usize,
+        ) -> Result<bool, crate::properties::Error> {
             let (sentence, _) = context;
-            let tags = sentence.index(position).tags().iter();
+            let tags = sentence.guard().tags(sentence.index(position))?.iter();
 
-            self.matcher
-                .is_match(tags, Some(context), Some(self.case_sensitive))
+            Ok(self
+                .matcher
+                .is_match(tags, Some(context), Some(self.case_sensitive)))
+        }
+
+        /// Declares that matching reads `Property::Tags`; the value is kept in a static.
+        fn compute_properties(&self) -> Properties {
+            lazy_static! {
+                static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]);
+            }
+            *PROPERTIES
         }
     }
 }
@@ -245,8 +295,13 @@ pub mod concrete {
 pub struct TrueAtom {}
 
 impl Atomable for TrueAtom {
-    fn is_match(&self, _context: Context, _position: usize) -> bool {
-        true
+    /// Always returns `Ok(true)`; context and position are ignored.
+    fn is_match(
+        &self,
+        _context: Context,
+        _position: usize,
+    ) -> Result<bool, crate::properties::Error> {
+        Ok(true)
     }
 }
 
@@ -254,8 +308,13 @@ impl Atomable for TrueAtom {
 pub struct FalseAtom {}
 
 impl Atomable for FalseAtom {
-    fn is_match(&self, _context: Context, _position: usize) -> bool {
-        false
+    /// Always returns `Ok(false)`; context and position are ignored.
+    fn is_match(
+        &self,
+        _context: Context,
+        _position: usize,
+    ) -> Result<bool, crate::properties::Error> {
+        Ok(false)
     }
 }
 
@@ -265,8 +323,25 @@ pub struct AndAtom {
 }
 
 impl Atomable for AndAtom {
-    fn is_match(&self, context: Context, position: usize) -> bool {
-        self.atoms.iter().all(|x| x.is_match(context, position))
+    /// Matches if every inner atom matches. Evaluation stops at the first atom
+    /// that does not match or that returns an error.
+    fn is_match(
+        &self,
+        context: Context,
+        position: usize,
+    ) -> Result<bool, crate::properties::Error> {
+        for atom in &self.atoms {
+            if !atom.is_match(context, position)? {
+                return Ok(false);
+            }
+        }
+
+        Ok(true)
+    }
+
+    /// Collects the properties of all inner atoms.
+    fn compute_properties(&self) -> Properties {
+        self.atoms.iter().map(Atom::compute_properties).collect()
     }
 }
 
@@ -276,8 +348,25 @@ pub struct OrAtom {
 }
 
 impl Atomable for OrAtom {
-    fn is_match(&self, context: Context, position: usize) -> bool {
-        self.atoms.iter().any(|x| x.is_match(context, position))
+    /// Matches if any inner atom matches. Evaluation stops at the first atom
+    /// that matches or that returns an error.
+    fn is_match(
+        &self,
+        context: Context,
+        position: usize,
+    ) -> Result<bool, crate::properties::Error> {
+        for atom in &self.atoms {
+            if atom.is_match(context, position)? {
+                return Ok(true);
+            }
+        }
+
+        Ok(false)
+    }
+
+    /// Collects the properties of all inner atoms.
+    fn compute_properties(&self) -> Properties {
+        self.atoms.iter().map(Atom::compute_properties).collect()
     }
 }
 
@@ -287,8 +373,18 @@ pub struct NotAtom {
 }
 
 impl Atomable for NotAtom {
-    fn is_match(&self, context: Context, position: usize) -> bool {
-        !self.atom.is_match(context, position)
+    /// Negates the result of the inner atom; an error of the inner atom is passed on unchanged.
+    fn is_match(
+        &self,
+        context: Context,
+        position: usize,
+    ) -> Result<bool, crate::properties::Error> {
+        Ok(!self.atom.is_match(context, position)?)
+    }
+
+    /// Returns the properties of the inner atom.
+    fn compute_properties(&self) -> Properties {
+        self.atom.compute_properties()
     }
 }
 
@@ -299,15 +393,29 @@ pub struct OffsetAtom {
 }
 
 impl Atomable for OffsetAtom {
-    fn is_match(&self, context: Context, position: usize) -> bool {
+    /// Applies the inner atom at `position + offset`.
+    /// Returns `Ok(false)` if the shifted position lies outside of the sentence.
+    fn is_match(
+        &self,
+        context: Context,
+        position: usize,
+    ) -> Result<bool, crate::properties::Error> {
         let (sentence, _) = context;
         let new_position = position as isize + self.offset;
 
-        if new_position < 0 || (new_position as usize) >= sentence.len() {
-            false
-        } else {
-            self.atom.is_match(context, new_position as usize)
-        }
+        // the inner atom is only consulted for positions inside of the sentence
+        Ok(
+            if new_position < 0 || (new_position as usize) >= sentence.len() {
+                false
+            } else {
+                self.atom.is_match(context, new_position as usize)?
+            },
+        )
+    }
+
+    /// Returns the properties of the inner atom.
+    fn compute_properties(&self) -> Properties {
+        self.atom.compute_properties()
     }
 }
 
@@ -357,33 +461,23 @@ impl GraphId {
     }
 }
 
-lazy_static! {
-    static ref SENT_START: Token<'static> = Token::new(
-        WordId::empty(),
-        Tags::new(vec![WordData::new(
-            WordId::empty(),
-            PosId::special(SpecialPos::SentStart),
-        )],),
-        Span::default(),
-        false,
-        false,
-        Vec::new(),
-    );
-}
-
-#[derive(Debug, Clone, PartialEq)]
+/// A sentence to match on, paired with the guard through which token properties are read.
+#[derive(Debug, Clone)]
 pub struct MatchSentence<'t> {
     sentence: &'t Sentence<'t>,
+    guard: PropertyGuard,
 }
 
 impl<'t> MatchSentence<'t> {
-    pub fn new(sentence: &'t Sentence<'t>) -> Self {
-        MatchSentence { sentence }
+    /// Wraps `sentence` together with the `guard` that is later returned by `guard()`.
+    pub fn new(sentence: &'t Sentence<'t>, guard: PropertyGuard) -> Self {
+        MatchSentence { sentence, guard }
     }
 
+    /// Returns the token at `index`: `0` is `SENT_START`, `i > 0` is sentence token `i - 1`.
     pub fn index(&self, index: usize) -> &Token {
         match index {
-            0 => &*SENT_START,
+            0 => &crate::types::SENT_START,
             i => &self.sentence.tokens()[i - 1],
         }
     }
@@ -409,6 +500,11 @@ impl<'t> MatchSentence<'t> {
         self.sentence.tagger()
     }
 
+    /// Returns the property guard used to read token properties (e.g. tags, chunks).
+    pub fn guard(&self) -> &PropertyGuard {
+        &self.guard
+    }
+
     pub fn span(&self) -> &Span {
         self.sentence.span()
     }
@@ -518,7 +613,20 @@ pub struct Composition {
 }
 
 impl Composition {
-    fn next_can_match(&self, context: Context, position: usize, index: usize) -> bool {
+    /// Collects the properties read by the atom of every part of this composition.
+    pub fn compute_properties(&self) -> Properties {
+        self.parts
+            .iter()
+            .map(|part| part.atom.compute_properties())
+            .collect()
+    }
+
+    fn next_can_match(
+        &self,
+        context: Context,
+        position: usize,
+        index: usize,
+    ) -> Result<bool, crate::properties::Error> {
         let next_required_pos = match self.parts[index + 1..]
             .iter()
             .position(|x| x.quantifier.min > 0)
@@ -527,9 +634,13 @@ impl Composition {
             None => self.parts.len(),
         };
 
-        self.parts[index + 1..next_required_pos]
-            .iter()
-            .any(|x| x.atom.is_match(context, position))
+        for part in &self.parts[index + 1..next_required_pos] {
+            if part.atom.is_match(context, position)? {
+                return Ok(true);
+            }
+        }
+
+        Ok(false)
     }
 
     fn apply_recursive<'t>(
@@ -538,7 +649,7 @@ impl Composition {
         mut position: usize,
         mut cur_atom_idx: usize,
         mut graph: MatchGraph<'t>,
-    ) -> Option<MatchGraph<'t>> {
+    ) -> Result<Option<MatchGraph<'t>>, crate::properties::Error> {
         let mut cur_count = 0;
         let is_match = loop {
             if cur_atom_idx >= self.parts.len() {
@@ -561,21 +672,23 @@ impl Composition {
             }
 
             if cur_count >= part.quantifier.min && cur_atom_idx + 1 < self.parts.len() {
-                if !part.greedy && self.next_can_match((sentence, &graph), position, cur_atom_idx) {
+                if !part.greedy
+                    && self.next_can_match((sentence, &graph), position, cur_atom_idx)?
+                {
                     cur_atom_idx += 1;
                     cur_count = 0;
                     continue;
                 }
                 if part.greedy {
                     if let Some(graph) =
-                        self.apply_recursive(sentence, position, cur_atom_idx + 1, graph.clone())
+                        self.apply_recursive(sentence, position, cur_atom_idx + 1, graph.clone())?
                     {
-                        return Some(graph);
+                        return Ok(Some(graph));
                     }
                 }
             }
 
-            if part.atom.is_match((sentence, &graph), position) {
+            if part.atom.is_match((sentence, &graph), position)? {
                 let group = &mut graph.groups[cur_atom_idx + 1];
 
                 // set the group beginning if the char end was zero (i. e. the group was empty)
@@ -599,19 +712,21 @@ impl Composition {
             cur_atom_idx += 1;
         }
 
-        if is_match || cur_atom_idx == self.parts.len() || self.can_stop_mask[cur_atom_idx] {
-            graph.fill_empty(sentence);
-            Some(graph)
-        } else {
-            None
-        }
+        Ok(
+            if is_match || cur_atom_idx == self.parts.len() || self.can_stop_mask[cur_atom_idx] {
+                graph.fill_empty(sentence);
+                Some(graph)
+            } else {
+                None
+            },
+        )
     }
 
     pub fn apply<'t>(
         &'t self,
         sentence: &'t MatchSentence,
         start: usize,
-    ) -> Option<MatchGraph<'t>> {
+    ) -> Result<Option<MatchGraph<'t>>, crate::properties::Error> {
         // this path is extremely hot so more optimizations are done
 
         // the first matcher can never rely on the match graph, so we use an empty default graph for the first match
@@ -623,9 +738,9 @@ impl Composition {
         if self.parts[0].quantifier.min > 0
             && !self.parts[0]
                 .atom
-                .is_match((sentence, &DEFAULT_GRAPH), start)
+                .is_match((sentence, &DEFAULT_GRAPH), start)?
         {
-            return None;
+            return Ok(None);
         }
 
         let position = start;
diff --git a/nlprule/src/rule/engine/mod.rs b/nlprule/src/rule/engine/mod.rs
index 22ec069..ceda355 100644
--- a/nlprule/src/rule/engine/mod.rs
+++ b/nlprule/src/rule/engine/mod.rs
@@ -1,4 +1,7 @@
+use std::iter;
+
 use crate::{
+    properties::*,
     types::*,
     utils::regex::{CaptureMatches, Regex},
 };
@@ -14,14 +17,18 @@ pub struct TokenEngine {
 }
 
 impl TokenEngine {
-    fn get_match<'t>(&'t self, sentence: &'t MatchSentence, i: usize) -> Option<MatchGraph<'t>> {
-        if let Some(graph) = self.composition.apply(sentence, i) {
+    fn get_match<'t>(
+        &'t self,
+        sentence: &'t MatchSentence,
+        i: usize,
+    ) -> Result<Option<MatchGraph<'t>>, crate::properties::Error> {
+        if let Some(graph) = self.composition.apply(sentence, i)? {
             let mut blocked = false;
 
             // TODO: cache / move to outer loop
             for i in 0..sentence.len() {
                 for antipattern in &self.antipatterns {
-                    if let Some(anti_graph) = antipattern.apply(sentence, i) {
+                    if let Some(anti_graph) = antipattern.apply(sentence, i)? {
                         let anti_start = anti_graph.by_index(0).span.char().start;
                         let anti_end = anti_graph
                             .by_index(anti_graph.groups().len() - 1)
@@ -44,11 +51,11 @@ impl TokenEngine {
             }
 
             if !blocked {
-                return Some(graph);
+                return Ok(Some(graph));
             }
         }
 
-        None
+        Ok(None)
     }
 }
 
@@ -84,7 +91,7 @@ pub struct EngineMatches<'a, 't> {
 }
 
 impl<'a, 't> Iterator for EngineMatches<'a, 't> {
-    type Item = MatchGraph<'t>;
+    type Item = Result<MatchGraph<'t>, crate::properties::Error>;
 
     fn next(&mut self) -> Option<Self::Item> {
         let sentence = self.sentence;
@@ -93,22 +100,30 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> {
 
         match &mut self.inner {
             InnerMatches::Token(inner) => (inner.index..sentence.len()).find_map(|i| {
-                inner.engine.get_match(sentence, i).and_then(|graph| {
-                    let start_group = graph.by_id(start_id);
-                    let end_group = graph.by_id(end_id);
+                match inner.engine.get_match(sentence, i) {
+                    Ok(graph) => graph.and_then(|graph| {
+                        let start_group = graph.by_id(start_id);
+                        let end_group = graph.by_id(end_id);
 
-                    let start = start_group.span.char().start - sentence.span().char().start;
-                    let end = end_group.span.char().end - sentence.span().char().start;
+                        let start = start_group.span.char().start - sentence.span().char().start;
+                        let end = end_group.span.char().end - sentence.span().char().start;
 
-                    if inner.mask[start..end].iter().all(|x| !x) {
-                        inner.mask[start..end].iter_mut().for_each(|x| *x = true);
+                        if inner.mask[start..end].iter().all(|x| !x) {
+                            inner.mask[start..end].iter_mut().for_each(|x| *x = true);
 
-                        inner.index += 1;
-                        Some(graph)
-                    } else {
-                        None
-                    }
-                })
+                            inner.index += 1;
+                            Some(Ok(graph))
+                        } else {
+                            None
+                        }
+                    }),
+                    Err(err) => {
+                        // `inner.index` is not advanced on failure, so the next call would retry
+                        // the same positions; exhaust the iterator so the error is yielded once
+                        inner.index = sentence.len();
+                        Some(Err(err))
+                    }
+                }
             }),
             InnerMatches::Text(inner) => inner.captures.next().map(|captures| {
                 let bi_to_ci = &inner.byte_idx_to_char_idx;
@@ -134,13 +144,29 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> {
                     }
                 }
 
-                MatchGraph::new(groups, inner.id_to_idx)
+                Ok(MatchGraph::new(groups, inner.id_to_idx))
             }),
         }
     }
 }
 
 impl Engine {
+    /// Collects the properties read by this engine.
+    ///
+    /// For a token engine these are the properties of all antipatterns followed by
+    /// those of the main composition. A text engine returns `Properties::default()`.
+    pub fn compute_properties(&self) -> Properties {
+        match &self {
+            Engine::Token(engine) => engine
+                .antipatterns
+                .iter()
+                .map(|x| x.compute_properties())
+                .chain(iter::once(engine.composition.compute_properties()))
+                .collect(),
+            Engine::Text(_, _) => Properties::default(),
+        }
+    }
+
     pub fn get_matches<'a, 't>(
         &'a self,
         sentence: &'t MatchSentence,
diff --git a/nlprule/src/rule/id.rs b/nlprule/src/rule/id.rs
index 7c0c21b..850843f 100644
--- a/nlprule/src/rule/id.rs
+++ b/nlprule/src/rule/id.rs
@@ -17,11 +17,10 @@
 //! Select individal rules:
 //!
 //! ```no_run
-//! use nlprule::{Tokenizer, Rules, rule::id::Category};
+//! use nlprule::{lang::en, rule::id::Category};
 //! use std::convert::TryInto;
 //!
-//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?;
-//! let mut rules = Rules::new("path/to/en_rules.bin")?;
+//! let mut rules = en::rules();
 //!
 //! // disable rules named "confusion_due_do" in category "confused_words"
 //! rules
diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs
index 5be2342..ce10292 100644
--- a/nlprule/src/rule/mod.rs
+++ b/nlprule/src/rule/mod.rs
@@ -1,16 +1,17 @@
 //! Implementations related to single rules.
 
-use crate::types::*;
 use crate::{
     filter::{Filter, Filterable},
-    tokenizer::Tokenizer,
+    properties::*,
+    types::*,
     utils,
 };
 use itertools::Itertools;
-use log::{error, info, warn};
+use lazy_static::lazy_static;
+use log::{debug, error};
 use serde::{Deserialize, Serialize};
-use std::collections::HashSet;
 use std::fmt;
+use std::{collections::HashSet, iter};
 
 pub(crate) mod disambiguation;
 pub(crate) mod engine;
@@ -38,7 +39,11 @@ pub(crate) struct Unification {
 }
 
 impl Unification {
-    pub fn keep(&self, graph: &MatchGraph, sentence: &MatchSentence) -> bool {
+    pub fn keep(
+        &self,
+        graph: &MatchGraph,
+        sentence: &MatchSentence,
+    ) -> Result<bool, crate::properties::Error> {
         let filters: Vec<_> = self.filters.iter().multi_cartesian_product().collect();
 
         let mut filter_mask: Vec<_> = filters.iter().map(|_| true).collect();
@@ -48,18 +53,24 @@ impl Unification {
             if maybe_mask_val.is_some() {
                 for token in group.tokens(sentence) {
                     for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) {
-                        *mask_val = *mask_val && PosFilter::and(filter, token.tags());
+                        *mask_val =
+                            *mask_val && PosFilter::and(filter, sentence.guard().tags(token)?);
                     }
                 }
             }
         }
 
         let result = filter_mask.iter().any(|x| *x);
-        if negate {
-            !result
-        } else {
-            result
+        Ok(if negate { !result } else { result })
+    }
+
+    /// Returns the properties unification reads: `keep` only accesses the token tags.
+    /// The value does not depend on `self`, so it is built once and kept in a static.
+    pub fn compute_properties(&self) -> Properties {
+        lazy_static! {
+            static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]);
         }
+        *PROPERTIES
     }
 }
 
@@ -126,21 +135,35 @@ impl Changes {
 }
 
 impl DisambiguationRule {
+    /// Returns the properties this rule accesses: everything read by the engine and by
+    /// the unification (if there is one), plus a write to `Property::Tags`.
+    pub fn compute_properties(&self) -> PropertiesMut {
+        iter::once(self.engine.compute_properties())
+            .chain(self.unification.iter().map(|x| x.compute_properties()))
+            .collect::<Properties>()
+            .write(&[Property::Tags])
+    }
+
     /// Get a unique identifier of this rule.
     pub fn id(&self) -> &Index {
         &self.id
     }
 
-    pub(crate) fn apply<'t>(&'t self, sentence: &MatchSentence<'t>) -> Changes {
+    pub(crate) fn apply<'t>(
+        &'t self,
+        sentence: &MatchSentence<'t>,
+    ) -> Result<Changes, crate::properties::Error> {
         if matches!(self.disambiguations, disambiguation::Disambiguation::Nop) {
-            return Changes::default();
+            return Ok(Changes::default());
         }
 
         let mut all_spans = Vec::new();
 
         for graph in self.engine.get_matches(sentence, self.start, self.end) {
+            let graph = graph?;
+
             if let Some(unification) = &self.unification {
-                if !unification.keep(&graph, sentence) {
+                if !unification.keep(&graph, sentence)? {
                     continue;
                 }
             }
@@ -165,11 +186,16 @@ impl DisambiguationRule {
             all_spans.push(spans);
         }
 
-        Changes(all_spans)
+        Ok(Changes(all_spans))
     }
 
-    pub(crate) fn change<'t>(&'t self, sentence: &mut Sentence<'t>, changes: Changes) {
-        log::info!("applying {}", self.id);
+    pub(crate) fn change<'t>(
+        &'t self,
+        sentence: &mut Sentence<'t>,
+        changes: Changes,
+        guard: PropertyGuardMut,
+    ) -> Result<(), crate::properties::Error> {
+        debug!("applying {}", self.id);
 
         for spans in changes.0 {
             let mut groups = Vec::new();
@@ -185,43 +211,51 @@ impl DisambiguationRule {
                 groups.push(group);
             }
 
-            self.disambiguations.apply(groups);
+            self.disambiguations.apply(groups, guard)?;
         }
+
+        Ok(())
     }
 
     /// Often there are examples associated with a rule.
     /// This method checks whether the correct action is taken in the examples.
-    pub fn test(&self, tokenizer: &Tokenizer) -> bool {
+    pub(crate) fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
         let mut passes = Vec::new();
 
-        for (i, test) in self.examples.iter().enumerate() {
+        for test in self.examples.iter() {
             let text = match test {
                 disambiguation::DisambiguationExample::Unchanged(x) => x.as_str(),
                 disambiguation::DisambiguationExample::Changed(x) => x.text.as_str(),
             };
 
             // by convention examples are always considered as one sentence even if the sentencizer would split
-            let sentence_before = tokenizer.disambiguate_up_to_id(
-                tokenizer
-                    .tokenize(text)
-                    .expect("test text must not be empty"),
-                Some(&self.id),
-            );
+            let sentence_before = tokenizer
+                .tokenize_sentence(text)
+                .expect("test text must not be empty");
 
             // shift the sentence to the right before matching to make sure
             // nothing assumes the sentene starts from absolute index zero
             let shift_delta = Position { byte: 1, char: 1 };
-            let sentence_before_complete = sentence_before.clone().rshift(shift_delta);
+            let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta);
+
+            let guard = self
+                .compute_properties()
+                .build(&mut sentence_before_complete)?;
+
             let changes = self
-                .apply(&MatchSentence::new(&sentence_before_complete))
+                .apply(&MatchSentence::new(
+                    &sentence_before_complete,
+                    guard.downgrade(),
+                ))
+                .unwrap()
                 .lshift(shift_delta);
             let mut sentence_after = sentence_before.clone();
 
             if !changes.is_empty() {
-                self.change(&mut sentence_after, changes);
+                self.change(&mut sentence_after, changes, guard).unwrap();
             }
 
-            info!("Tokens: {:#?}", sentence_before);
+            debug!("Tokens: {:#?}", sentence_before);
 
             let pass = match test {
                 disambiguation::DisambiguationExample::Unchanged(_) => {
@@ -238,38 +272,29 @@ impl DisambiguationRule {
                         .find(|x| *x.span().char() == change.char_span)
                         .unwrap();
 
-                    let unordered_tags = after.tags().iter().collect::<HashSet<&WordData>>();
+                    let unordered_tags =
+                        after.tags().unwrap().iter().collect::<HashSet<&WordData>>();
                     let unordered_tags_change = change.after.iter().collect::<HashSet<&WordData>>();
 
-                    let pass = unordered_tags == unordered_tags_change;
-                    if !pass {
-                        println!("{:#?} ---- {:#?}", unordered_tags, unordered_tags_change);
-                    }
-                    pass
+                    unordered_tags == unordered_tags_change
                 }
             };
 
             if !pass {
-                let error_str = format!(
+                error!(
                     "Rule {}: Test \"{:#?}\" failed. Before: {:#?}. After: {:#?}.",
-                    self.id, test, sentence_before, sentence_after,
-                );
-
-                if tokenizer
-                    .lang_options()
-                    .known_failures
-                    .contains(&format!("{}:{}", self.id, i))
-                {
-                    warn!("{}", error_str)
-                } else {
-                    error!("{}", error_str)
-                }
+                    self.id, test, sentence_before, sentence_after
+                )
             }
 
             passes.push(pass);
         }
 
-        passes.iter().all(|x| *x)
+        if passes.iter().all(|x| *x) {
+            Ok(())
+        } else {
+            Err(crate::Error::TestFailed)
+        }
     }
 }
 
@@ -280,82 +305,98 @@ pub struct Suggestions<'a, 't> {
     sentence: &'t MatchSentence<'t>,
 }
 
-impl<'a, 't> Iterator for Suggestions<'a, 't> {
-    type Item = Suggestion;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let rule = self.rule;
-        let sentence = self.sentence;
-        let (start, end) = (self.rule.start, self.rule.end);
-
-        self.matches.find_map(|graph| {
-            if let Some(unification) = &rule.unification {
-                if !unification.keep(&graph, sentence) {
-                    return None;
-                }
+impl<'a, 't> Suggestions<'a, 't> {
+    /// Builds the suggestion for one match of `rule` in `sentence`.
+    ///
+    /// Returns `Ok(None)` if unification rejects the match, if the computed span is
+    /// inverted, or if no replacement differing from the original text remains.
+    /// An error of the match itself or of unification is propagated.
+    fn suggest_from_graph(
+        graph: Result<MatchGraph, crate::properties::Error>,
+        rule: &'a Rule,
+        sentence: &'t MatchSentence<'t>,
+    ) -> Result<Option<Suggestion>, crate::properties::Error> {
+        let graph = graph?;
+
+        if let Some(unification) = &rule.unification {
+            if !unification.keep(&graph, sentence)? {
+                return Ok(None);
             }
+        }
+
+        let start_group = graph.by_id(rule.start);
+        let end_group = graph.by_id(rule.end);
 
-            let start_group = graph.by_id(start);
-            let end_group = graph.by_id(end);
+        let replacements: Vec<String> = rule
+            .suggesters
+            .iter()
+            .filter_map(|x| x.apply(sentence, &graph, rule.start, rule.end))
+            .collect();
 
-            let replacements: Vec<String> = rule
-                .suggesters
+        let start = if replacements
+            .iter()
+            .all(|x| utils::no_space_chars().chars().any(|c| x.starts_with(c)))
+        {
+            let first_token = graph.groups()[graph.get_index(rule.start)..]
                 .iter()
-                .filter_map(|x| x.apply(sentence, &graph, start, end))
-                .collect();
+                .find_map(|x| x.tokens(sentence).next())
+                .unwrap();
 
-            let start = if replacements
+            let idx = sentence
                 .iter()
-                .all(|x| utils::no_space_chars().chars().any(|c| x.starts_with(c)))
-            {
-                let first_token = graph.groups()[graph.get_index(start)..]
-                    .iter()
-                    .find_map(|x| x.tokens(sentence).next())
-                    .unwrap();
-
-                let idx = sentence
-                    .iter()
-                    .position(|x| std::ptr::eq(x, first_token))
-                    .unwrap_or(0);
-
-                if idx > 0 {
-                    sentence.index(idx - 1).span().end()
-                } else {
-                    start_group.span.start()
-                }
+                .position(|x| std::ptr::eq(x, first_token))
+                .unwrap_or(0);
+
+            if idx > 0 {
+                sentence.index(idx - 1).span().end()
             } else {
                 start_group.span.start()
-            };
-            let end = end_group.span.end();
-
-            // this should never happen, but just return None instead of raising an Error
-            // `end` COULD be equal to `start` if the suggestion is to insert text at this position
-            if end < start {
-                return None;
             }
+        } else {
+            start_group.span.start()
+        };
+        let end = end_group.span.end();
+
+        // this should never happen, but just return None instead of raising an Error
+        // `end` COULD be equal to `start` if the suggestion is to insert text at this position
+        if end < start {
+            return Ok(None);
+        }
 
-            let text_before = sentence.slice(Span::from_positions(start, end));
+        let text_before = sentence.slice(Span::from_positions(start, end));
+
+        // fix e. g. "Super , dass"
+        let replacements: Vec<String> = replacements
+            .into_iter()
+            .filter(|suggestion| *suggestion != text_before)
+            .map(|x| utils::fix_nospace_chars(&x))
+            .collect();
+
+        Ok(if !replacements.is_empty() {
+            Some(Suggestion::new(
+                rule.id.to_string(),
+                rule.message
+                    .apply(sentence, &graph, rule.start, rule.end)
+                    .expect("Rules must have a message."),
+                Span::from_positions(start, end),
+                replacements,
+            ))
+        } else {
+            None
+        })
+    }
+}
 
-            // fix e. g. "Super , dass"
-            let replacements: Vec<String> = replacements
-                .into_iter()
-                .filter(|suggestion| *suggestion != text_before)
-                .map(|x| utils::fix_nospace_chars(&x))
-                .collect();
+impl<'a, 't> Iterator for Suggestions<'a, 't> {
+    type Item = Result<Suggestion, crate::properties::Error>;
 
-            if !replacements.is_empty() {
-                Some(Suggestion::new(
-                    rule.id.to_string(),
-                    rule.message
-                        .apply(sentence, &graph, rule.start, rule.end)
-                        .expect("Rules must have a message."),
-                    Span::from_positions(start, end),
-                    replacements,
-                ))
-            } else {
-                None
-            }
-        })
+    /// Yields the next suggestion or error, skipping matches that produce no suggestion.
+    fn next(&mut self) -> Option<Self::Item> {
+        let rule = self.rule;
+        let sentence = self.sentence;
+
+        self.matches
+            .find_map(|graph| Suggestions::suggest_from_graph(graph, rule, sentence).transpose())
     }
 }
 
@@ -414,6 +449,14 @@ impl Rule {
         self.enabled
     }
 
+    /// Returns the properties this rule reads: those of the engine combined with
+    /// those of the unification, if there is one.
+    pub fn compute_properties(&self) -> Properties {
+        iter::once(self.engine.compute_properties())
+            .chain(self.unification.iter().map(|x| x.compute_properties()))
+            .collect()
+    }
+
     /// Get a unique identifier of this rule.
     pub fn id(&self) -> &Index {
         &self.id
@@ -459,7 +500,7 @@ impl Rule {
 
     /// Grammar rules always have at least one example associated with them.
     /// This method checks whether the correct action is taken in the examples.
-    pub fn test(&self, tokenizer: &Tokenizer) -> bool {
+    pub(crate) fn test<TOK: Tokenize>(&self, tokenizer: TOK) -> Result<(), crate::Error> {
         let mut passes = Vec::new();
 
         // make sure relative position is handled correctly
@@ -470,17 +511,17 @@ impl Rule {
         for test in self.examples.iter() {
             // by convention examples are always considered as one sentence even if the sentencizer would split
             let sentence = tokenizer
-                .disambiguate(
-                    tokenizer
-                        .tokenize(&test.text())
-                        .expect("test text must not be empty."),
-                )
+                .tokenize_sentence(&test.text())
+                .expect("test text must not be empty.")
                 .rshift(shift_delta);
 
-            info!("Sentence: {:#?}", sentence);
+            debug!("Sentence: {:#?}", sentence);
             let suggestions: Vec<_> = self
-                .apply(&MatchSentence::new(&sentence))
-                .map(|s| s.lshift(shift_delta))
+                .apply(&MatchSentence::new(
+                    &sentence,
+                    self.compute_properties().build(&sentence)?,
+                ))
+                .map(|s| s.unwrap().lshift(shift_delta))
                 .collect();
 
             let pass = if suggestions.len() > 1 {
@@ -495,7 +536,7 @@ impl Rule {
             };
 
             if !pass {
-                warn!(
+                error!(
                     "Rule {}: test \"{}\" failed. Expected: {:#?}. Found: {:#?}.",
                     self.id,
                     test.text(),
@@ -507,6 +548,10 @@ impl Rule {
             passes.push(pass);
         }
 
-        passes.iter().all(|x| *x)
+        if passes.iter().all(|x| *x) {
+            Ok(())
+        } else {
+            Err(crate::Error::TestFailed)
+        }
     }
 }
diff --git a/nlprule/src/rules.rs b/nlprule/src/rules.rs
deleted file mode 100644
index dc924fc..0000000
--- a/nlprule/src/rules.rs
+++ /dev/null
@@ -1,227 +0,0 @@
-//! Sets of grammatical error correction rules.
-
-use crate::types::*;
-use crate::utils::parallelism::MaybeParallelRefIterator;
-use crate::{rule::id::Selector, rule::MatchSentence, rule::Rule, tokenizer::Tokenizer, Error};
-use fs_err::File;
-use serde::{Deserialize, Serialize};
-use std::{
-    io::{BufReader, Read, Write},
-    iter::FromIterator,
-    path::Path,
-};
-
-/// Language-dependent options for a rule set.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub(crate) struct RulesLangOptions {
-    /// Whether to allow errors while constructing the rules.
-    pub allow_errors: bool,
-    /// Grammar Rule selectors to use in this set.
-    #[serde(default)]
-    pub ids: Vec<Selector>,
-    /// Grammar Rule selectors to ignore in this set.
-    #[serde(default)]
-    pub ignore_ids: Vec<Selector>,
-}
-
-impl Default for RulesLangOptions {
-    fn default() -> Self {
-        RulesLangOptions {
-            allow_errors: true,
-            ids: Vec::new(),
-            ignore_ids: Vec::new(),
-        }
-    }
-}
-
-/// A set of grammatical error correction rules.
-#[derive(Serialize, Deserialize, Default)]
-pub struct Rules {
-    pub(crate) rules: Vec<Rule>,
-}
-
-impl Rules {
-    /// Creates a new rule set from a path to a binary.
-    ///
-    /// # Errors
-    /// - If the file can not be opened.
-    /// - If the file content can not be deserialized to a rules set.
-    pub fn new<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
-        let reader = BufReader::new(File::open(p.as_ref())?);
-        let rules: Rules = bincode::deserialize_from(reader)?;
-        Ok(rules)
-    }
-
-    /// Creates a new rules set from a reader.
-    pub fn from_reader<R: Read>(reader: R) -> Result<Self, Error> {
-        Ok(bincode::deserialize_from(reader)?)
-    }
-
-    /// Serializes this rules set to a writer.
-    pub fn to_writer<W: Write>(&self, writer: W) -> Result<(), Error> {
-        Ok(bincode::serialize_into(writer, &self)?)
-    }
-
-    /// All rules ordered by priority.
-    pub fn rules(&self) -> &[Rule] {
-        &self.rules
-    }
-
-    /// All rules ordered by priority (mutable).
-    pub fn rules_mut(&mut self) -> &mut [Rule] {
-        &mut self.rules
-    }
-
-    /// Returns an iterator over all rules matching the selector.
-    pub fn select<'a>(&'a self, selector: &'a Selector) -> RulesIter<'a> {
-        RulesIter {
-            inner: self.rules.iter(),
-            selector: Some(selector),
-        }
-    }
-
-    /// Returns an iterator over all rules matching the selector (mutable).
-    pub fn select_mut<'a>(&'a mut self, selector: &'a Selector) -> RulesIterMut<'a> {
-        RulesIterMut {
-            inner: self.rules.iter_mut(),
-            selector: Some(selector),
-        }
-    }
-
-    /// Compute the suggestions for the given sentence by checking all rules.
-    pub fn apply(&self, sentence: &Sentence) -> Vec<Suggestion> {
-        let sentence = MatchSentence::new(sentence);
-
-        let mut output: Vec<(usize, Suggestion)> = self
-            .rules
-            .maybe_par_iter()
-            .enumerate()
-            .filter(|(_, rule)| rule.enabled())
-            .map(|(i, rule)| {
-                let mut output = Vec::new();
-
-                for suggestion in rule.apply(&sentence) {
-                    output.push((i, suggestion));
-                }
-
-                output
-            })
-            .flatten()
-            .collect();
-
-        output.sort_by(|(ia, a), (ib, b)| {
-            a.span()
-                .char()
-                .start
-                .cmp(&b.span().char().start)
-                .then_with(|| ib.cmp(ia))
-        });
-
-        let mut mask = vec![false; sentence.text().chars().count()];
-
-        output
-            .into_iter()
-            .filter_map(|(_, suggestion)| {
-                let span = suggestion.span().clone().lshift(sentence.span().start());
-
-                if mask[span.char().clone()].iter().all(|x| !x) {
-                    mask[span.char().clone()].iter_mut().for_each(|x| *x = true);
-                    Some(suggestion)
-                } else {
-                    None
-                }
-            })
-            .collect()
-    }
-
-    /// Compute the suggestions for a text by checking all rules.
-    pub fn suggest(&self, text: &str, tokenizer: &Tokenizer) -> Vec<Suggestion> {
-        if text.is_empty() {
-            return Vec::new();
-        }
-
-        let mut suggestions = Vec::new();
-
-        // get suggestions sentence by sentence
-        for sentence in tokenizer.pipe(text) {
-            suggestions.extend(self.apply(&sentence));
-        }
-
-        suggestions
-    }
-
-    /// Correct a text by first tokenizing, then finding all suggestions and choosing the first replacement of each suggestion.
-    pub fn correct(&self, text: &str, tokenizer: &Tokenizer) -> String {
-        let suggestions = self.suggest(text, tokenizer);
-        apply_suggestions(text, &suggestions)
-    }
-}
-
-/// Correct a text by applying suggestions to it.
-/// In the case of multiple possible replacements, always chooses the first one.
-pub fn apply_suggestions(text: &str, suggestions: &[Suggestion]) -> String {
-    let mut offset: isize = 0;
-    let mut chars: Vec<_> = text.chars().collect();
-
-    for suggestion in suggestions {
-        let replacement: Vec<_> = suggestion.replacements()[0].chars().collect();
-        chars.splice(
-            (suggestion.span().char().start as isize + offset) as usize
-                ..(suggestion.span().char().end as isize + offset) as usize,
-            replacement.iter().cloned(),
-        );
-        offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize;
-    }
-
-    chars.into_iter().collect()
-}
-
-/// An iterator over references to rules.
-pub struct RulesIter<'a> {
-    selector: Option<&'a Selector>,
-    inner: std::slice::Iter<'a, Rule>,
-}
-
-impl<'a> Iterator for RulesIter<'a> {
-    type Item = &'a Rule;
-    fn next(&mut self) -> Option<Self::Item> {
-        let selector = self.selector.as_ref();
-
-        self.inner
-            .find(|rule| selector.map_or(true, |s| s.is_match(rule.id())))
-    }
-}
-
-/// An iterator over mutable references to rules.
-pub struct RulesIterMut<'a> {
-    selector: Option<&'a Selector>,
-    inner: std::slice::IterMut<'a, Rule>,
-}
-
-impl<'a> Iterator for RulesIterMut<'a> {
-    type Item = &'a mut Rule;
-    fn next(&mut self) -> Option<Self::Item> {
-        let selector = self.selector.as_ref();
-
-        self.inner
-            .find(|rule| selector.map_or(true, |s| s.is_match(rule.id())))
-    }
-}
-
-impl IntoIterator for Rules {
-    type Item = Rule;
-    type IntoIter = std::vec::IntoIter<Rule>;
-    fn into_iter(self) -> Self::IntoIter {
-        self.rules.into_iter()
-    }
-}
-
-impl<R> FromIterator<R> for Rules
-where
-    R: Into<Rule>,
-{
-    fn from_iter<I: IntoIterator<Item = R>>(iter: I) -> Self {
-        let rules: Vec<Rule> = iter.into_iter().map(|x| x.into()).collect();
-        Self { rules }
-    }
-}
diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/tokenizer.rs
deleted file mode 100644
index 6076195..0000000
--- a/nlprule/src/tokenizer.rs
+++ /dev/null
@@ -1,396 +0,0 @@
-//! A tokenizer to split raw text into tokens.
-//! Tokens are assigned lemmas and part-of-speech tags by lookup from a [Tagger][tag::Tagger] and chunks containing
-//! information about noun / verb and grammatical case by a statistical [Chunker][chunk::Chunker].
-//! Tokens are *disambiguated* (i. e. information from the initial assignment is changed) in a rule-based way by
-//! [DisambiguationRule][crate::rule::DisambiguationRule]s.
-
-use crate::{
-    rule::id::{Index, Selector},
-    rule::MatchSentence,
-    types::*,
-    utils::{parallelism::MaybeParallelRefIterator, regex::Regex},
-    Error,
-};
-use fs_err::File;
-use serde::{Deserialize, Serialize};
-use std::{
-    io::{BufReader, Read, Write},
-    ops::Range,
-    path::Path,
-    sync::Arc,
-};
-
-pub mod chunk;
-pub mod multiword;
-pub mod tag;
-
-use chunk::Chunker;
-use multiword::MultiwordTagger;
-use tag::Tagger;
-
-use crate::rule::DisambiguationRule;
-
-/// Split a text at the points where the given function is true.
-/// Keeps the separators. See https://stackoverflow.com/a/40296745.
-fn split<F>(text: &str, split_func: F) -> Vec<&str>
-where
-    F: Fn(char) -> bool,
-{
-    let mut result = Vec::new();
-    let mut last = 0;
-    for (index, matched) in text.match_indices(split_func) {
-        if last != index {
-            result.push(&text[last..index]);
-        }
-        result.push(matched);
-        last = index + matched.len();
-    }
-    if last < text.len() {
-        result.push(&text[last..]);
-    }
-
-    result
-}
-
-/// Options for a tokenizer.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub(crate) struct TokenizerLangOptions {
-    /// Whether to allow errors while constructing the tokenizer.
-    pub allow_errors: bool,
-    /// Disambiguation Rule selectors to use in this tokenizer.
-    #[serde(default)]
-    pub ids: Vec<Selector>,
-    /// Disambiguation Rule selectors to ignore in this tokenizer.
-    #[serde(default)]
-    pub ignore_ids: Vec<Selector>,
-    /// Specific examples in the notation `{id}:{example_index}` which are known to fail.
-    #[serde(default)]
-    pub known_failures: Vec<String>,
-    /// Extra language-specific characters to split text on.
-    #[serde(default)]
-    pub extra_split_chars: Vec<char>,
-    /// Extra language-specific Regexes of which the matches will *not* be split into multiple tokens.
-    #[serde(default)]
-    pub extra_join_regexes: Vec<Regex>,
-}
-
-impl Default for TokenizerLangOptions {
-    fn default() -> Self {
-        TokenizerLangOptions {
-            allow_errors: false,
-            ids: Vec::new(),
-            ignore_ids: Vec::new(),
-            known_failures: Vec::new(),
-            extra_split_chars: Vec::new(),
-            extra_join_regexes: Vec::new(),
-        }
-    }
-}
-
-/// An iterator over [IncompleteSentence]s. Has the same properties as [SentenceIter].
-pub struct IncompleteSentenceIter<'t> {
-    text: &'t str,
-    splits: Vec<Range<usize>>,
-    tokenizer: &'t Tokenizer,
-    index: usize,
-    position: Position,
-}
-
-impl<'t> Iterator for IncompleteSentenceIter<'t> {
-    type Item = Sentence<'t>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.index == self.splits.len() {
-            return None;
-        }
-
-        let mut range = self.splits[self.index].clone();
-        self.index += 1;
-
-        // as long as the current sentence contains only whitespace, add the next sentence
-        // in practice, this might never happen, but we can not make any assumption about
-        // SRX rule behavior here.
-        while self.text[range.clone()].trim().is_empty() && self.index < self.splits.len() {
-            range.end = self.splits[self.index].end;
-            self.index += 1;
-        }
-
-        let sentence = self
-            .tokenizer
-            .tokenize(&self.text[range.clone()])
-            .map(|x| x.rshift(self.position));
-
-        self.position += Position {
-            char: self.text[range.clone()].chars().count(),
-            byte: range.len(),
-        };
-
-        sentence
-    }
-}
-
-/// An iterator over [Sentence]s. Has some key properties:
-/// - Preceding whitespace is always included so the first sentence always starts at byte and char index zero.
-/// - There are no gaps between sentences i.e. `sentence[i - 1].span().end() == sentence[i].span().start()`.
-/// - Behavior for trailing whitespace is not defined. Can be included in the last sentence or not be part of any sentence.
-pub struct SentenceIter<'t> {
-    inner: IncompleteSentenceIter<'t>,
-    tokenizer: &'t Tokenizer,
-}
-
-impl<'t> Iterator for SentenceIter<'t> {
-    type Item = Sentence<'t>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.inner
-            .next()
-            .map(|sentence| self.tokenizer.disambiguate(sentence))
-    }
-}
-
-/// The complete Tokenizer doing tagging, chunking and disambiguation.
-#[derive(Serialize, Deserialize, Default, Clone)]
-pub struct Tokenizer {
-    pub(crate) rules: Vec<DisambiguationRule>,
-    pub(crate) chunker: Option<Chunker>,
-    pub(crate) sentencizer: srx::Rules,
-    pub(crate) multiword_tagger: Option<MultiwordTagger>,
-    pub(crate) tagger: Arc<Tagger>,
-    pub(crate) lang_options: TokenizerLangOptions,
-}
-
-impl Tokenizer {
-    /// Creates a new tokenizer from a path to a binary.
-    ///
-    /// # Errors
-    /// - If the file can not be opened.
-    /// - If the file content can not be deserialized to a rules set.
-    pub fn new<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
-        let reader = BufReader::new(File::open(p.as_ref())?);
-        Ok(bincode::deserialize_from(reader)?)
-    }
-
-    /// Creates a new tokenizer from a reader.
-    pub fn from_reader<R: Read>(reader: R) -> Result<Self, Error> {
-        Ok(bincode::deserialize_from(reader)?)
-    }
-
-    /// Serializes this rules set to a writer.
-    pub fn to_writer<W: Write>(&self, writer: W) -> Result<(), Error> {
-        Ok(bincode::serialize_into(writer, &self)?)
-    }
-
-    /// Gets all disambigation rules in the order they are applied.
-    pub fn rules(&self) -> &[DisambiguationRule] {
-        &self.rules
-    }
-
-    /// Gets the lexical tagger.
-    pub fn tagger(&self) -> &Arc<Tagger> {
-        &self.tagger
-    }
-
-    /// Gets the chunker if one exists.
-    pub fn chunker(&self) -> &Option<Chunker> {
-        &self.chunker
-    }
-
-    pub(crate) fn lang_options(&self) -> &TokenizerLangOptions {
-        &self.lang_options
-    }
-
-    pub(crate) fn disambiguate_up_to_id<'t>(
-        &'t self,
-        mut sentence: Sentence<'t>,
-        id: Option<&Index>,
-    ) -> Sentence<'t> {
-        let n = id.map_or(self.rules.len(), |id| {
-            self.rules.iter().position(|x| x.id == *id).unwrap()
-        });
-        let mut i = 0;
-
-        while i < n {
-            let match_sentence = MatchSentence::new(&sentence);
-
-            let result = self.rules[i..n]
-                .maybe_par_iter()
-                .enumerate()
-                .filter_map(|(j, rule)| {
-                    let changes = rule.apply(&match_sentence);
-                    if changes.is_empty() {
-                        None
-                    } else {
-                        Some((j + i, changes))
-                    }
-                })
-                .find_first(|_| true);
-
-            if let Some((index, changes)) = result {
-                self.rules[index].change(&mut sentence, changes);
-                i = index + 1;
-            } else {
-                i = n;
-            }
-        }
-
-        sentence
-    }
-
-    /// Apply rule-based disambiguation to the tokens.
-    /// This does not change the number of tokens, but can change the content arbitrarily.
-    pub fn disambiguate<'t>(&'t self, sentence: Sentence<'t>) -> Sentence<'t> {
-        self.disambiguate_up_to_id(sentence, None)
-    }
-
-    fn get_token_ranges<'t>(
-        &self,
-        text: &'t str,
-    ) -> impl ExactSizeIterator<Item = Range<usize>> + 't + Clone {
-        let mut tokens = Vec::new();
-
-        let split_char = |c: char| c.is_whitespace() || crate::utils::splitting_chars().contains(c);
-        let split_text = |text: &'t str| {
-            let mut tokens = Vec::new();
-            for pretoken in split(text, split_char) {
-                // if the token is in the dictionary, we add it right away
-                if self.tagger.id_word(pretoken.into()).1.is_some() {
-                    tokens.push(pretoken);
-                } else {
-                    // otherwise, potentially split it again with `extra_split_chars` e. g. "-"
-                    tokens.extend(split(pretoken, |c| {
-                        split_char(c) || self.lang_options.extra_split_chars.contains(&c)
-                    }));
-                }
-            }
-            tokens
-        };
-
-        let mut joined_mask = vec![false; text.len()];
-        let mut joins = Vec::new();
-
-        for regex in self.lang_options.extra_join_regexes.iter() {
-            for mat in regex.find_iter(text) {
-                if !joined_mask[mat.start()..mat.end()].iter().any(|x| *x) {
-                    joins.push(mat.start()..mat.end());
-                    joined_mask[mat.start()..mat.end()]
-                        .iter_mut()
-                        .for_each(|x| *x = true);
-                }
-            }
-        }
-
-        joins.sort_by(|a, b| a.start.cmp(&b.start));
-
-        let mut prev = 0;
-        for range in joins {
-            tokens.extend(split_text(&text[prev..range.start]));
-            prev = range.end;
-            tokens.push(&text[range]);
-        }
-
-        tokens.extend(split_text(&text[prev..text.len()]));
-        tokens.into_iter().map(move |token| {
-            let byte_start = (token.as_ptr() as usize)
-                .checked_sub(text.as_ptr() as usize)
-                .expect("Each token str is a slice of the text str.");
-
-            byte_start..byte_start + token.len()
-        })
-    }
-
-    /// Tokenize the given sentence. This applies chunking and tagging, but does not do disambiguation.
-    // NB: this is not public because it could be easily misused by passing a text instead of one sentence.
-    pub(crate) fn tokenize<'t>(&'t self, sentence: &'t str) -> Option<Sentence<'t>> {
-        if sentence.trim().is_empty() {
-            return None;
-        }
-
-        let token_strs = self
-            .get_token_ranges(sentence)
-            .filter(|range| !sentence[range.clone()].trim().is_empty());
-
-        let n_token_strs = token_strs.clone().count();
-
-        let tokens: Vec<_> = token_strs
-            .enumerate()
-            .map(|(i, range)| {
-                let byte_start = range.start;
-                let char_start = sentence[..byte_start].chars().count();
-
-                let token_text = sentence[range].trim();
-
-                let is_sentence_start = i == 0;
-                let is_sentence_end = i == n_token_strs - 1;
-
-                let id = self.tagger.id_word(token_text.into());
-
-                let mut tag_vec: Vec<_> = self
-                    .tagger
-                    .get_tags_with_options(
-                        token_text,
-                        if is_sentence_start { Some(true) } else { None },
-                        None,
-                    )
-                    .collect();
-
-                tag_vec.push(
-                    WordData::new(
-                        self.tagger().id_word(token_text.into()),
-                        PosId::special(SpecialPos::None),
-                    )
-                    .freeze(),
-                );
-
-                if is_sentence_end {
-                    tag_vec.push(
-                        WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd))
-                            .freeze(),
-                    );
-                }
-
-                Token::new(
-                    id,
-                    Tags::new(tag_vec),
-                    Span::new(
-                        byte_start..byte_start + token_text.len(),
-                        char_start..char_start + token_text.chars().count(),
-                    ),
-                    is_sentence_end,
-                    sentence[..byte_start].ends_with(char::is_whitespace),
-                    Vec::new(),
-                )
-            })
-            .collect();
-
-        let mut sentence = Sentence::new(tokens, sentence, &self.tagger);
-
-        if let Some(chunker) = &self.chunker {
-            chunker.apply(&mut sentence);
-        }
-
-        if let Some(multiword_tagger) = &self.multiword_tagger {
-            multiword_tagger.apply(&mut sentence);
-        }
-
-        Some(sentence)
-    }
-
-    /// Splits the text into sentences and tokenizes each sentence.
-    pub fn sentencize<'t>(&'t self, text: &'t str) -> IncompleteSentenceIter<'t> {
-        IncompleteSentenceIter {
-            text,
-            splits: self.sentencizer.split_ranges(text),
-            tokenizer: &self,
-            index: 0,
-            position: Position::default(),
-        }
-    }
-
-    /// Applies the entire tokenization pipeline including sentencization, tagging, chunking and disambiguation.
-    pub fn pipe<'t>(&'t self, text: &'t str) -> SentenceIter<'t> {
-        SentenceIter {
-            inner: self.sentencize(text),
-            tokenizer: &self,
-        }
-    }
-}
diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs
index d89809e..7f66716 100644
--- a/nlprule/src/types.rs
+++ b/nlprule/src/types.rs
@@ -1,8 +1,7 @@
 //! Fundamental types used by this crate.
 
-use crate::tokenizer::tag::Tagger;
-pub use crate::tokenizer::tag::{PosId, WordId};
-pub(crate) use crate::tokenizer::tag::{PosIdInt, SpecialPos, WordIdInt};
+pub(crate) use crate::components::tagger::{PosId, SpecialPos, WordId, WordIdInt};
+use crate::{components::tagger::Tagger, properties::Property};
 use derivative::Derivative;
 use lazy_static::lazy_static;
 use serde::{Deserialize, Serialize};
@@ -69,6 +68,12 @@ impl<'t> Sentence<'t> {
         &self.tokens
     }
 
+    /// Gets the first token in this sentence. There is always at least one token in the sentence
+    /// so this will never panic.
+    pub fn first(&self) -> &Token<'t> {
+        &self.tokens[0]
+    }
+
     /// Gets the amount of tokens in this sentence.
     pub fn len(&self) -> usize {
         self.tokens.len()
@@ -177,15 +182,20 @@ impl<'a, 't> Iterator for TagIter<'a, 't> {
 
 /// Contains all the local information about a token i. e.
 /// the text itself and the [WordData]s associated with the word.
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
 pub struct Tags<'t> {
+    id: WordId<'t>,
     tags: Vec<WordData<'t>>,
 }
 
 impl<'t> Tags<'t> {
     /// Creates new [Tags].
-    pub fn new(tags: Vec<WordData<'t>>) -> Self {
-        Tags { tags }
+    pub fn new(id: WordId<'t>, tags: Vec<WordData<'t>>) -> Self {
+        Tags { id, tags }
+    }
+
+    pub fn id(&self) -> &WordId<'t> {
+        &self.id
     }
 
     /// Multiple pairs of (lemma, part-of-speech) associated with this token.
@@ -221,59 +231,64 @@ impl<'t> Tags<'t> {
     /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data.
     pub fn into_static(self) -> Tags<'static> {
         Tags {
+            id: self.id.into_static(),
             tags: self.tags.into_iter().map(|x| x.into_static()).collect(),
         }
     }
 }
 
+lazy_static! {
+    pub(crate) static ref SENT_START: Token<'static> = Token {
+        text: "",
+        span: Span::default(),
+        is_sentence_start: false, // `is_sentence_start` marks the first *real* token in the sentence.
+        is_sentence_end: false,
+        has_space_before: false,
+        tags: Some(Tags::new(
+            WordId::empty(),
+            vec![WordData::new(
+                WordId::empty(),
+                PosId::special(SpecialPos::SentStart),
+            )],
+        )),
+        chunks: Some(Vec::new()),
+    };
+}
+
 /// A token where varying levels of information are set.
 #[derive(Debug, Clone, PartialEq)]
 pub struct Token<'t> {
-    text: WordId<'t>,
-    tags: Tags<'t>,
+    text: &'t str,
     span: Span,
+    is_sentence_start: bool,
     is_sentence_end: bool,
     has_space_before: bool,
-    chunks: Vec<String>,
+    pub tags: Option<Tags<'t>>,
+    pub chunks: Option<Vec<String>>,
 }
 
 impl<'t> Token<'t> {
     pub(crate) fn new(
-        text: WordId<'t>,
-        tags: Tags<'t>,
+        text: &'t str,
         span: Span,
+        is_sentence_start: bool,
         is_sentence_end: bool,
         has_space_before: bool,
-        chunks: Vec<String>,
     ) -> Self {
         Token {
             text,
-            tags,
             span,
+            is_sentence_start,
             is_sentence_end,
             has_space_before,
-            chunks,
+            tags: None,
+            chunks: None,
         }
     }
 
-    /// Gets the word id for this token.
-    pub fn text(&self) -> &WordId<'t> {
-        &self.text
-    }
-
     /// Gets the token as string.
-    pub fn as_str(&self) -> &str {
-        self.text.as_str()
-    }
-
-    /// The tags of this token. Contain information about the part-of-speech tags and lemmas.
-    pub fn tags(&self) -> &Tags<'t> {
-        &self.tags
-    }
-
-    #[allow(missing_docs)]
-    pub fn tags_mut(&mut self) -> &mut Tags<'t> {
-        &mut self.tags
+    pub fn as_str(&self) -> &'t str {
+        self.text
     }
 
     /// The span of this sentence.
@@ -281,7 +296,12 @@ impl<'t> Token<'t> {
         &self.span
     }
 
-    /// Whether this token is the last token in the sentence-
+    /// Whether this token is the first token in the sentence.
+    pub fn is_sentence_start(&self) -> bool {
+        self.is_sentence_start
+    }
+
+    /// Whether this token is the last token in the sentence.
     pub fn is_sentence_end(&self) -> bool {
         self.is_sentence_end
     }
@@ -291,32 +311,37 @@ impl<'t> Token<'t> {
         self.has_space_before
     }
 
-    /// Chunks associated with this token.
-    pub fn chunks(&self) -> &[String] {
-        &self.chunks
-    }
-
-    #[allow(missing_docs)]
-    pub fn chunks_mut(&mut self) -> &mut Vec<String> {
-        &mut self.chunks
-    }
-
     /// Shift the span of this token right by the specified amount.
     pub fn rshift(mut self, position: Position) -> Self {
         self.span = self.span.rshift(position);
         self
     }
+}
 
-    /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data.
-    pub fn into_static(self) -> Token<'static> {
-        Token {
-            text: self.text.into_static(),
-            tags: self.tags.into_static(),
-            span: self.span,
-            is_sentence_end: self.is_sentence_end,
-            has_space_before: self.has_space_before,
-            chunks: self.chunks,
-        }
+impl<'t> Token<'t> {
+    /// The tags of this token. Contain information about the part-of-speech tags and lemmas.
+    pub fn tags(&self) -> Result<&Tags<'t>, crate::Error> {
+        self.tags
+            .as_ref()
+            .ok_or_else(|| crate::properties::Error::Unset(Property::Tags).into())
+    }
+
+    pub fn tags_mut(&mut self) -> Result<&mut Tags<'t>, crate::Error> {
+        self.tags
+            .as_mut()
+            .ok_or_else(|| crate::properties::Error::Unset(Property::Tags).into())
+    }
+
+    pub fn chunks(&self) -> Result<&[String], crate::Error> {
+        self.chunks
+            .as_deref()
+            .ok_or_else(|| crate::properties::Error::Unset(Property::Chunks).into())
+    }
+
+    pub fn chunks_mut(&mut self) -> Result<&mut Vec<String>, crate::Error> {
+        self.chunks
+            .as_mut()
+            .ok_or_else(|| crate::properties::Error::Unset(Property::Chunks).into())
     }
 }
 
diff --git a/nlprule/tests/tests.rs b/nlprule/tests/tests.rs
index 7d08956..6408c96 100644
--- a/nlprule/tests/tests.rs
+++ b/nlprule/tests/tests.rs
@@ -1,20 +1,17 @@
 use std::convert::TryInto;
 
 use lazy_static::lazy_static;
-use nlprule::{rule::id::Category, types::Position, Rules, Tokenizer};
+use nlprule::{lang::en, properties::*, rule::id::Category, types::Position};
 use quickcheck_macros::quickcheck;
 
-const TOKENIZER_PATH: &str = "../storage/en_tokenizer.bin";
-const RULES_PATH: &str = "../storage/en_rules.bin";
-
 lazy_static! {
-    static ref TOKENIZER: Tokenizer = Tokenizer::new(TOKENIZER_PATH).unwrap();
-    static ref RULES: Rules = Rules::new(RULES_PATH).unwrap();
+    static ref ANALYZER: en::Analyzer = en::analyzer();
+    static ref CORRECTER: en::Correcter = en::correcter();
 }
 
 #[test]
-fn can_tokenize_empty_text() {
-    let sentences: Vec<_> = TOKENIZER.pipe("").collect();
+fn can_analyze_empty_text() {
+    let sentences: Vec<_> = ANALYZER.tokenize("").collect();
     assert!(sentences.is_empty());
 }
 
@@ -23,7 +20,7 @@ fn handles_whitespace_correctly() {
     // preceding whitespace has to be included, trailing whitespace behavior is unspecified
     let text = "  hello.\ttest.\t\t";
 
-    let mut sentences = TOKENIZER.pipe(text);
+    let mut sentences = ANALYZER.tokenize(text);
     assert_eq!(
         &text[sentences.next().unwrap().span().byte().clone()],
         "  hello.\t"
@@ -32,21 +29,21 @@ fn handles_whitespace_correctly() {
         &text[sentences.next().unwrap().span().byte().clone()],
         "test.\t"
     );
-    assert_eq!(sentences.next(), None);
+    assert!(sentences.next().is_none());
 }
 
 #[quickcheck]
-fn can_tokenize_anything(text: String) -> bool {
-    let _: Vec<_> = TOKENIZER.pipe(&text).collect();
+fn can_analyze_anything(text: String) -> bool {
+    let _: Vec<_> = ANALYZER.tokenize(&text).collect();
     true
 }
 
 #[test]
 fn suggest_indices_are_relative_to_input_text() {
-    let suggestions = RULES.suggest(
-        "I can due his homework for 10€. I can due his homework.",
-        &*TOKENIZER,
-    );
+    let suggestions: Vec<_> = CORRECTER
+        .suggest("I can due his homework for 10€. I can due his homework.")
+        .flatten()
+        .collect();
 
     assert_eq!(*suggestions[0].span().char(), 6..9);
     assert_eq!(*suggestions[0].span().byte(), 6..9);
@@ -62,7 +59,7 @@ fn suggest_indices_are_relative_to_input_text() {
 fn sentence_spans_correct() {
     let text = "A short test. A test with emoji 😊.";
 
-    let sentences: Vec<_> = TOKENIZER.pipe(text).collect();
+    let sentences: Vec<_> = ANALYZER.tokenize(text).collect();
     assert_eq!(sentences.len(), 2);
 
     assert_eq!(*sentences[0].span().char(), 0..14);
@@ -76,8 +73,8 @@ fn sentence_spans_correct() {
 fn token_spans_correct() {
     let text = "A short test. A test with emoji 😊.";
 
-    let tokens: Vec<_> = TOKENIZER
-        .pipe(text)
+    let tokens: Vec<_> = ANALYZER
+        .tokenize(text)
         .map(|x| x.into_iter())
         .flatten()
         .collect();
@@ -99,7 +96,7 @@ fn no_gaps_between_sentences(text: String) {
     let mut prev_pos = Position::default();
     let mut contains_sentence = false;
 
-    for sentence in TOKENIZER.pipe(&text) {
+    for sentence in ANALYZER.tokenize(&text) {
         assert_eq!(sentence.span().start(), prev_pos);
         prev_pos += sentence.span().len();
 
@@ -111,14 +108,18 @@ fn no_gaps_between_sentences(text: String) {
 
 #[test]
 fn rules_can_be_disabled_enabled() {
-    let mut rules = Rules::new(RULES_PATH).unwrap();
+    let mut correcter = CORRECTER.clone();
 
     // enabled by default
-    assert!(!rules
-        .suggest("I can due his homework", &*TOKENIZER)
-        .is_empty());
+    assert!(correcter
+        .suggest("I can due his homework")
+        .flatten()
+        .next()
+        .is_some());
 
-    rules
+    correcter
+        .components_mut()
+        .1
         .select_mut(
             &Category::new("confused_words")
                 .join("confusion_due_do")
@@ -127,17 +128,28 @@ fn rules_can_be_disabled_enabled() {
         .for_each(|x| x.disable());
 
     // disabled now
-    assert!(rules
-        .suggest("I can due his homework", &*TOKENIZER)
-        .is_empty());
+    assert!(correcter
+        .suggest("I can due his homework")
+        .flatten()
+        .next()
+        .is_none());
 
     // disabled by default
-    assert!(rules.suggest("I can not go", &*TOKENIZER).is_empty());
+    assert!(correcter.suggest("I can not go").flatten().next().is_none());
 
-    rules
+    correcter
+        .components_mut()
+        .1
         .select_mut(&"typos/can_not".try_into().unwrap())
         .for_each(|x| x.enable());
 
     // enabled now
-    assert!(!rules.suggest("I can not go", &*TOKENIZER).is_empty());
+    assert!(correcter.suggest("I can not go").flatten().next().is_some());
+}
+
+#[test]
+fn pipelines_work_with_references() -> Result<(), crate::Error> {
+    // Only construction is exercised: both components are passed by reference, and `?` fails the test if `Pipeline::new` returns `Err`.
+    let _pipeline = Pipeline::new((&*ANALYZER, &CORRECTER.components().1))?;
+    Ok(())
 }
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 49d1e28..ccb310e 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -182,6 +182,7 @@ impl PyToken {
     fn data(&self) -> Vec<(&str, &str)> {
         self.token
             .tags()
+            .unwrap()
             .iter()
             .map(|x| (x.lemma().as_str(), x.pos().as_str()))
             .collect()
@@ -192,6 +193,7 @@ impl PyToken {
         let mut lemmas: Vec<_> = self
             .token
             .tags()
+            .unwrap()
             .iter()
             .filter_map(|x| {
                 if x.lemma().as_str().is_empty() {
@@ -211,6 +213,7 @@ impl PyToken {
         let mut tags: Vec<_> = self
             .token
             .tags()
+            .unwrap()
             .iter()
             .filter_map(|x| {
                 if x.pos().as_str().is_empty() {
@@ -227,7 +230,12 @@ impl PyToken {
 
     #[getter]
     fn chunks(&self) -> Vec<&str> {
-        self.token.chunks().iter().map(|x| x.as_str()).collect()
+        self.token
+            .chunks()
+            .unwrap()
+            .iter()
+            .map(|x| x.as_str())
+            .collect()
     }
 }
 
@@ -355,6 +363,7 @@ impl PyTokenizer {
                 .pipe(&text)
                 .map(|sentence| {
                     sentence
+                        .unwrap()
                         .into_iter()
                         .map(|token| PyCell::new(py, PyToken::from(token.into_static())))
                         .collect::<PyResult<Vec<_>>>()
@@ -619,6 +628,7 @@ impl PyRules {
             self.rules
                 .read()
                 .suggest(&sentence, &tokenizer)
+                .unwrap()
                 .into_iter()
                 .map(|x| PyCell::new(py, PySuggestion::from(x)))
                 .collect::<PyResult<Vec<_>>>()
@@ -639,7 +649,7 @@ impl PyRules {
             let tokenizer = self.tokenizer.borrow(py);
             let tokenizer = tokenizer.tokenizer();
 
-            Ok(self.rules.read().correct(&text, tokenizer))
+            Ok(self.rules.read().correct(&text, tokenizer).unwrap())
         })
     }
 
diff --git a/scripts/build_and_test.sh b/scripts/build_and_test.sh
index bb18ed8..5b28e3b 100755
--- a/scripts/build_and_test.sh
+++ b/scripts/build_and_test.sh
@@ -4,26 +4,41 @@ then
   exit
 fi
 
-# this script assumes the build directories are in data/
-# only for convenience
-mkdir -p storage
+# abort as soon as any download, build or test command fails
+set -e
 
-# x-- => only compile
-# -xx => test_disambiguation and test
-# xxx or flags not set => everything
-flags=${2:-"xxx"}
+# data/ is not guaranteed to exist on a fresh checkout, so create it before cd-ing into it
+# NOTE(review): the compile stage below writes to nlprule/storage/$1, not nlprule/src/storage - confirm which is intended
+mkdir -p nlprule/src/storage data
 
-if [ "${flags:0:1}" == "x" ] 
-then
-    RUST_LOG=INFO cargo run --all-features --bin compile -- --build-dir data/$1 --tokenizer-out storage/$1_tokenizer.bin --rules-out storage/$1_rules.bin
+cd data
+
+# download + extract the build directory from backblaze if we don't have it yet
+if [ ! -f "$1.zip" ]; then
+  wget "https://f000.backblazeb2.com/file/nlprule/$1.zip"
+  unzip -o "$1.zip"
 fi
 
-if [ "${flags:1:1}" == "x" ] 
+cd ..
+
+# the second argument selects which stages run, one character per stage:
+# x- => only compile
+# -x => only test
+# xx or flags not set => everything
+flags=${2:-"xx"}
+
+if [ "${flags:0:1}" == "x" ]
 then
-    RUST_LOG=WARN cargo run --all-features --bin test_disambiguation -- --tokenizer storage/$1_tokenizer.bin
+    # compile stage: run the `compile` binary on data/$1, with --out-dir given relative to nlprule/
+    cd nlprule
+    RUST_LOG=INFO cargo run --features "compile bin" --bin compile -- --build-dir "../data/$1" --out-dir "storage/$1"
+    cd ..
 fi
 
-if [ "${flags:2:1}" == "x" ] 
+if [ "${flags:1:1}" == "x" ]
 then
-    RUST_LOG=WARN cargo run --all-features --bin test -- --tokenizer storage/$1_tokenizer.bin --rules storage/$1_rules.bin
+    # test stage: run the language-specific test binary (test_$1) with the binaries-$1 feature enabled
+    cd nlprule
+    RUST_LOG=INFO cargo run --no-default-features --features "bin binaries-$1 regex-all-test" --bin "test_$1"
+    cd ..
 fi
\ No newline at end of file