Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/deliver.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Workflow: run the coffee-productivity analysis and publish the outputs
# as a build artifact. Structure reconstructed (indentation was lost in
# extraction); content otherwise unchanged.
name: Deliver results
on: [push] # for now, to get latest results

jobs:
  make-plot:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout the code
        uses: actions/checkout@v6

      - name: Install pixi
        uses: prefix-dev/[email protected]
        with:
          pixi-version: v0.62.2 # good to pin this since pixi is not 1.0 yet!

      - name: run analysis
        run: |
          cd analysis
          pixi run pkoffee analyze --data-file coffee_productivity.csv --output fitted_models.toml --show-rankings
          pixi run pkoffee plot --data-file coffee_productivity.csv --models fitted_models.toml --output analysis.png --y-max 7

      - name: upload results
        uses: actions/upload-artifact@v6
        with:
          name: results
          path: |
            analysis/analysis.png
            analysis/fitted_models.toml
46 changes: 46 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Workflow: run the unit-test matrix (3 OSes x 3 pixi environments).
# Structure reconstructed (indentation was lost in extraction); content
# otherwise unchanged.
name: Test Matrix
on: [push]

jobs:
  code-quality:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout the code
        uses: actions/checkout@v6

      - name: Install pixi
        uses: prefix-dev/[email protected]
        with:
          pixi-version: v0.62.2 # good to pin this since pixi is not 1.0 yet!

      # NOTE(review): the only real check in this job is commented out, so the
      # job currently does nothing but still gates unit-tests below — either
      # re-enable ruff or drop the `needs:` dependency. TODO confirm intent.
      # - name: look for syntax errors with ruff
      #   run: pixi run ruff check

  unit-tests:
    runs-on: ${{ matrix.os }}
    needs: code-quality # don't bother running tests if code-checks fails.

    strategy:
      fail-fast: false # don't stop all jobs if one fails
      matrix:
        os:
          - ubuntu-latest
          - macos-latest
          - windows-latest
        environment:
          - test
          - test314
          - test312

    steps:
      - name: Checkout the code
        uses: actions/checkout@v6

      - name: Install pixi
        uses: prefix-dev/[email protected]
        with:
          pixi-version: v0.62.2
          environments: ${{ matrix.environment }}

      - run: pixi run -e ${{ matrix.environment }} pytest
5,424 changes: 4,554 additions & 870 deletions pixi.lock

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ python = ["3.12.*", "3.13.*", "3.14.*"]

[dependencies]
pkoffee = { path = "." }
numpy = ">=2.4.1,<3"
matplotlib = ">=3.10.8,<4"

[package]
name = "pkoffee"
Expand Down Expand Up @@ -61,14 +63,37 @@ python=">=3.14,<3.15"
[feature.dev.dependencies]
jupyterlab = ">=4.4.3,<5"

[feature.test.dependencies]
# example task converts notebooks to markdown to include so they can be included in documentation
nbconvert = ">=7.16.6,<8"
pytest = ">=9,<10"
pytest-cov = ">=7,<8"

[feature.test.tasks.test]
args = [
{ "arg" = "coverage_dir", "default" = "htmlcov" },
{ "arg" = "cobertura_report", "default" = "cobertura_report.xml" },
{ "arg" = "junit_report", "default" = "junit_report.xml"}
]
cmd = "pytest -vv --cov-report=html:{{ coverage_dir }} --cov-report=xml:{{ cobertura_report }} --junitxml={{ junit_report }}"


[tasks]
analyse = "pkoffee analyze --data-file analysis/coffee_productivity.csv --output analysis/fitted_models.toml --show-rankings"
plot = "pkoffee plot --data-file analysis/coffee_productivity.csv --models analysis/fitted_models.toml --output analysis/analysis.png --y-max 7"


[environments]
prod = { features = ["py313"], solve-group = "prod" }
test = { features = ["test", "py313"], solve-group = "prod" }
# use default for the dev environment so tasks are resolved in this environment (avoids having to specify env)
# no need to include doc: pixi will figure the environment to use from the task if it is unique
default = { features = ["dev", "py313"], solve-group = "prod" }

# python 3.14 environments for testing
prod314 = { features = ["py314"], solve-group = "prod314" }
test314 = { features = ["test", "py314"], solve-group = "prod314" }

# python 3.12 environments for testing
prod312 = { features = ["py312"], solve-group = "prod312" }
test312 = { features = ["test", "py312"], solve-group = "prod312" }
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Unit tests initialization."""
140 changes: 140 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Unit tests for data loading and preprocessing."""

from pathlib import Path

import numpy as np
import pandas as pd
import pytest

from pkoffee.data import (
ColumnTypeError,
CSVReadError,
MissingColumnsError,
RequiredColumn,
curate,
data_dtype,
extract_arrays,
load_csv,
validate,
)


def test_validate() -> None:
    """validate returns None for a valid DataFrame with all required columns."""
    frame = pd.DataFrame({"cups": [0], "productivity": [1.2]})
    assert validate(frame) is None


def test_validate_wrong_type() -> None:
    """validate raises ColumnTypeError when a required column has the wrong dtype."""
    bad_frame = pd.DataFrame({"cups": [0], "productivity": ["a"]})
    with pytest.raises(ColumnTypeError):
        validate(bad_frame)


def test_validate_missing_column() -> None:
    """validate raises MissingColumnsError when a required column is absent."""
    frame = pd.DataFrame({f"{RequiredColumn.CUPS}": [1], "notproductivity": [1.2]})
    with pytest.raises(MissingColumnsError):
        validate(frame)


@pytest.mark.parametrize(
    ("data", "expected"),
    [
        # one NaN in either column drops that whole row
        (
            pd.DataFrame({"cups": [1, np.nan, 2], "productivity": [1.2, 2.1, np.nan]}),
            pd.DataFrame({"cups": [1.0], "productivity": [1.2]}),
        ),
        # every row contains a NaN -> empty result
        (
            pd.DataFrame({"cups": [np.nan, np.nan, np.nan], "productivity": [1.2, 2.1, 3.4]}),
            pd.DataFrame({"cups": [], "productivity": []}),
        ),
        # no NaN values -> data passes through unchanged
        (
            pd.DataFrame({"cups": [1, 1, 4], "productivity": [1.2, 2.1, 0.5]}),
            pd.DataFrame({"cups": [1, 1, 4], "productivity": [1.2, 2.1, 0.5]}),
        ),
    ],
)
def test_curate(data: pd.DataFrame, expected: pd.DataFrame) -> None:
    """Test curate with different DataFrames containing NaNs.

    Renamed from test_currate (typo) — safe: pytest discovers any test_*
    function and nothing references the old name.
    """
    assert curate(data).equals(expected)


def test_load_csv_valid_file(tmp_path: Path) -> None:
    """load_csv parses a well-formed CSV and preserves column values and dtypes."""
    csv_path = tmp_path / "valid.csv"
    cups_ref = np.array([1, 2, 3], dtype=int)
    productivity_ref = np.array([2.3, 1.2, 4.8], dtype=data_dtype)
    np.savetxt(
        csv_path,
        np.stack([cups_ref, productivity_ref], axis=1),
        fmt=["%d", "%10.4f"],
        delimiter=",",
        header=f"{RequiredColumn.CUPS},{RequiredColumn.PRODUCTIVITY}",
        comments="",
    )

    loaded = load_csv(csv_path)

    for column in (RequiredColumn.CUPS, RequiredColumn.PRODUCTIVITY):
        assert column in loaded.columns
    assert np.isclose(loaded[RequiredColumn.CUPS].to_numpy(), cups_ref).all()
    assert np.isclose(loaded[RequiredColumn.PRODUCTIVITY].to_numpy(), productivity_ref).all()
    assert loaded.dtypes[RequiredColumn.CUPS] == np.int64
    assert loaded.dtypes[RequiredColumn.PRODUCTIVITY] == np.float64


def test_load_csv_missing_file() -> None:
    """load_csv propagates FileNotFoundError for a path that does not exist."""
    missing = Path("nonexistent_file.csv")
    with pytest.raises(FileNotFoundError):
        load_csv(missing)


def test_load_csv_missing_columns(tmp_path: Path) -> None:
    """load_csv raises MissingColumnsError when a required column header is absent."""
    csv_path = tmp_path / "missing_columns.csv"
    np.savetxt(
        csv_path,
        np.stack([[1], [2.3]], axis=1),
        fmt=["%d", "%10.4f"],
        delimiter=",",
        # deliberately replace the productivity column with an unknown one
        header=f"{RequiredColumn.CUPS},wrong_column",
        comments="",
    )

    with pytest.raises(MissingColumnsError, match="Missing required columns"):
        load_csv(csv_path)


def test_load_data_with_nan_values(tmp_path: Path) -> None:
    """load_csv drops rows that contain NaN values."""
    csv_path = tmp_path / "valid_with_nan.csv"
    lines = [
        f"{RequiredColumn.CUPS},{RequiredColumn.PRODUCTIVITY}\n",
        "1,10.5\n",
        "2,\n",  # Missing productivity
        "3,18.2\n",
    ]
    with csv_path.open("w") as fh:
        fh.writelines(lines)

    expected = pd.DataFrame({RequiredColumn.CUPS: [1, 3], RequiredColumn.PRODUCTIVITY: [10.5, 18.2]})
    assert load_csv(csv_path).equals(expected)


def test_load_data_with_extra_values(tmp_path: Path) -> None:
    """Test that CSVReadError is raised when the CSV cannot be read.

    The file handle is still open for writing when load_csv is called, so the
    header/row are presumably still in the write buffer and the on-disk file
    is empty or partial — TODO confirm this is the intended failure mode.
    (Docstring and filename fixed: both were copy-pasted from the NaN test
    above and described dropping NaN rows, which this test does not do.)
    """
    data_file = tmp_path / "partially_written.csv"
    with data_file.open("w") as fh:
        fh.write(f"{RequiredColumn.CUPS},{RequiredColumn.PRODUCTIVITY}\n")
        fh.write("1,2.1\n")
        # try to read the file while it is open for write
        with pytest.raises(CSVReadError):
            load_csv(data_file)


def test_extract_arrays() -> None:
    """extract_arrays returns the cups and productivity columns as numpy arrays."""
    cups_ref = np.array([1, 2, 3], dtype=int)
    productivity_ref = np.array([10.5, 15.3, 18.2], dtype=np.float64)
    frame = pd.DataFrame(
        {RequiredColumn.CUPS: cups_ref, RequiredColumn.PRODUCTIVITY: productivity_ref}
    )

    cups_out, productivity_out = extract_arrays(frame)

    assert np.allclose(cups_ref, cups_out)
    assert np.allclose(productivity_ref, productivity_out)
Loading
Loading