Commit 8899d51

Testing for OSNeuralSparseDocV3GTE download and load

1 parent 80be7fe

File tree: 2 files changed, +341 -0 lines changed
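For orientation, the behaviour under test is a two-step download-then-load flow on the model class. A minimal caller-side sketch, using only the constructor and method names exercised by the tests below (the output path is illustrative, not taken from the repository):

from pathlib import Path

from embeddings.models.os_neural_sparse_doc_v3_gte import OSNeuralSparseDocV3GTE

# Hypothetical usage inferred from the tests in this commit, not from the production code.
model = OSNeuralSparseDocV3GTE()

# download() materializes the HuggingFace snapshot at the given path (a directory
# here; a path ending in ".zip" produces an archive instead) and returns the path.
model_path = model.download(Path("model_output"))

# load() reads the downloaded files and populates the private state the tests
# assert on: _model, _tokenizer, _special_token_ids, _id_to_token.
model.load(model_path)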

tests/conftest.py

Lines changed: 121 additions & 0 deletions
@@ -1,3 +1,5 @@
+import json
+import logging
 import zipfile
 from pathlib import Path
 
@@ -6,6 +8,8 @@
 
 from embeddings.models.base import BaseEmbeddingModel
 
+logger = logging.getLogger(__name__)
+
 
 @pytest.fixture(autouse=True)
 def _test_env(monkeypatch):
@@ -32,8 +36,125 @@ def download(self, output_path: Path) -> Path:
             zf.writestr("tokenizer.json", '{"version": "1.0"}')
         return output_path
 
+    def load(self, model_path: str | Path) -> None:  # noqa: ARG002
+        logger.info("Model loaded successfully, 1.5s")
+
 
 @pytest.fixture
 def mock_model():
     """Fixture providing a MockEmbeddingModel instance."""
     return MockEmbeddingModel()
+
+
+@pytest.fixture
+def neural_sparse_doc_v3_gte_fake_model_directory(tmp_path):
+    """Create a fake downloaded model directory with required files."""
+    model_dir = tmp_path / "fake_model"
+    model_dir.mkdir()
+
+    # create config.json
+    config_json = {
+        "model_type": "distilbert",
+        "vocab_size": 30000,
+        "auto_map": {
+            "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+            "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+        },
+    }
+    (model_dir / "config.json").write_text(json.dumps(config_json))
+
+    # create modeling.py and configuration.py
+    (model_dir / "modeling.py").write_text("# mock modeling code")
+    (model_dir / "configuration.py").write_text("# mock configuration code")
+
+    # create tokenizer files
+    (model_dir / "tokenizer.json").write_text('{"version": "1.0"}')
+    (model_dir / "vocab.txt").write_text("word1\nword2\n")
+
+    return model_dir
+
+
+@pytest.fixture
+def neural_sparse_doc_v3_gte_mock_huggingface_snapshot(monkeypatch, tmp_path):
+    """Mock snapshot_download to create fake model files locally."""
+
+    def mock_snapshot(repo_id, local_dir, **kwargs):
+        """Create fake model files based on repo_id."""
+        local_path = Path(local_dir)
+        local_path.mkdir(parents=True, exist_ok=True)
+
+        if repo_id == "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte":
+            # create main model files
+            config_json = {
+                "model_type": "distilbert",
+                "vocab_size": 30000,
+                "auto_map": {
+                    "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+                    "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+                },
+            }
+            (local_path / "config.json").write_text(json.dumps(config_json))
+            (local_path / "pytorch_model.bin").write_bytes(b"fake weights")
+            (local_path / "tokenizer.json").write_text('{"version": "1.0"}')
+
+        elif repo_id == "Alibaba-NLP/new-impl":
+            # create alibaba dependency files
+            (local_path / "modeling.py").write_text("# Alibaba modeling code")
+            (local_path / "configuration.py").write_text("# Alibaba configuration code")
+
+        return str(local_path)
+
+    monkeypatch.setattr(
+        "embeddings.models.os_neural_sparse_doc_v3_gte.snapshot_download", mock_snapshot
+    )
+    return mock_snapshot
+
+
+@pytest.fixture
+def neural_sparse_doc_v3_gte_mock_transformers_models(monkeypatch):
+    """Mock AutoModelForMaskedLM and AutoTokenizer."""
+
+    class MockTokenizer:
+        """Mock tokenizer with necessary attributes."""
+
+        def __init__(self, *args, **kwargs):  # noqa: ARG002
+            self.vocab = {
+                "[CLS]": 0,
+                "[SEP]": 1,
+                "[PAD]": 2,
+                "word1": 3,
+                "word2": 4,
+            }
+            self.vocab_size = len(self.vocab)
+            self.special_tokens_map = {
+                "cls_token": "[CLS]",
+                "sep_token": "[SEP]",
+                "pad_token": "[PAD]",
+            }
+
+    class MockModel:
+        """Mock model with necessary attributes."""
+
+        def __init__(self, *args, **kwargs):  # noqa: ARG002
+            self.config = {"vocab_size": 30000}
+
+    class MockAutoTokenizer:
+        @staticmethod
+        def from_pretrained(*args, **kwargs):  # noqa: ARG004
+            return MockTokenizer()
+
+    class MockAutoModelForMaskedLM:
+        @staticmethod
+        def from_pretrained(*args, **kwargs):  # noqa: ARG004
+            return MockModel()
+
+    monkeypatch.setattr(
+        "embeddings.models.os_neural_sparse_doc_v3_gte.AutoTokenizer",
+        MockAutoTokenizer,
+    )
+    monkeypatch.setattr(
+        "embeddings.models.os_neural_sparse_doc_v3_gte.AutoModelForMaskedLM",
+        MockAutoModelForMaskedLM,
+    )
+
+    return {"tokenizer": MockTokenizer, "model": MockModel}
Second changed file: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
+"""Tests for OSNeuralSparseDocV3GTE embedding model."""
+
+# ruff: noqa: SLF001, PLR2004
+
+import json
+from pathlib import Path
+
+import pytest
+
+from embeddings.models.os_neural_sparse_doc_v3_gte import OSNeuralSparseDocV3GTE
+
+
+def test_init():
+    """Test model initialization."""
+    model = OSNeuralSparseDocV3GTE()
+    assert model._model is None
+    assert model._tokenizer is None
+    assert model._special_token_ids is None
+    assert model._id_to_token is None
+
+
+def test_model_uri():
+    """Test model_uri property returns correct HuggingFace URI."""
+    model = OSNeuralSparseDocV3GTE()
+    assert (
+        model.model_uri
+        == "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte"
+    )
+    assert (
+        model.MODEL_URI
+        == "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte"
+    )
+
+
+def test_download_to_directory(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test download to directory (not zip)."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model_output"
+
+    result = model.download(output_path)
+
+    assert result == output_path
+    assert output_path.exists()
+    assert (output_path / "config.json").exists()
+    assert (output_path / "pytorch_model.bin").exists()
+    assert (output_path / "tokenizer.json").exists()
+
+
+def test_download_to_zip_file(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test download creates zip when path ends in .zip."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model.zip"
+
+    result = model.download(output_path)
+
+    assert result == output_path
+    assert output_path.exists()
+    assert output_path.suffix == ".zip"
+
+
+def test_download_calls_patch_method(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path, monkeypatch
+):
+    """Test that download calls the Alibaba patching method."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model_output"
+
+    patch_called = False
+
+    def mock_patch(temp_path):
+        nonlocal patch_called
+        patch_called = True
+
+    monkeypatch.setattr(model, "_patch_local_model_with_alibaba_new_impl", mock_patch)
+
+    model.download(output_path)
+
+    assert patch_called
+
+
+def test_download_returns_path(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test download returns the output path."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model_output"
+
+    result = model.download(output_path)
+
+    assert result == output_path
+    assert isinstance(result, Path)
+
+
+def test_patch_downloads_alibaba_model(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test patch method downloads Alibaba-NLP/new-impl."""
+    model = OSNeuralSparseDocV3GTE()
+    model_temp_path = tmp_path / "temp_model"
+    model_temp_path.mkdir()
+    (model_temp_path / "config.json").write_text('{"model_type": "test"}')
+
+    model._patch_local_model_with_alibaba_new_impl(model_temp_path)
+
+    assert (model_temp_path / "modeling.py").exists()
+    assert (model_temp_path / "configuration.py").exists()
+
+
+def test_patch_copies_files(neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path):
+    """Test patch copies modeling.py and configuration.py."""
+    model = OSNeuralSparseDocV3GTE()
+    model_temp_path = tmp_path / "temp_model"
+    model_temp_path.mkdir()
+    (model_temp_path / "config.json").write_text('{"model_type": "test"}')
+
+    model._patch_local_model_with_alibaba_new_impl(model_temp_path)
+
+    modeling_content = (model_temp_path / "modeling.py").read_text()
+    config_content = (model_temp_path / "configuration.py").read_text()
+
+    assert "Alibaba modeling code" in modeling_content
+    assert "Alibaba configuration code" in config_content
+
+
+def test_patch_updates_config_json(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test patch updates auto_map in config.json."""
+    model = OSNeuralSparseDocV3GTE()
+    model_temp_path = tmp_path / "temp_model"
+    model_temp_path.mkdir()
+    initial_config = {"model_type": "test", "vocab_size": 30000}
+    (model_temp_path / "config.json").write_text(json.dumps(initial_config))
+
+    model._patch_local_model_with_alibaba_new_impl(model_temp_path)
+
+    updated_config = json.loads((model_temp_path / "config.json").read_text())
+
+    assert "auto_map" in updated_config
+    assert updated_config["auto_map"]["AutoConfig"] == "configuration.NewConfig"
+    assert updated_config["auto_map"]["AutoModel"] == "modeling.NewModel"
+    assert updated_config["auto_map"]["AutoModelForMaskedLM"] == "modeling.NewForMaskedLM"
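
The three patch tests above pin down the contract of _patch_local_model_with_alibaba_new_impl: fetch Alibaba-NLP/new-impl, drop modeling.py and configuration.py next to the model files, and rewrite auto_map in config.json. A minimal sketch of a routine satisfying that contract, assuming the module imports snapshot_download from huggingface_hub (which the conftest monkeypatching suggests); the method body is shown here as a free function for brevity, and it is a sketch, not the commit's actual implementation:

import json
import shutil
import tempfile
from pathlib import Path

from huggingface_hub import snapshot_download


def _patch_local_model_with_alibaba_new_impl(model_temp_path: Path) -> None:
    """Copy the Alibaba-NLP/new-impl code next to the model and point auto_map at it."""
    # Fetch the dependency repo into a scratch directory and copy the two files over.
    with tempfile.TemporaryDirectory() as impl_dir:
        snapshot_download(repo_id="Alibaba-NLP/new-impl", local_dir=impl_dir)
        for name in ("modeling.py", "configuration.py"):
            shutil.copy(Path(impl_dir) / name, model_temp_path / name)

    # Repoint auto_map at the local copies, matching the assertions above.
    config_path = model_temp_path / "config.json"
    config = json.loads(config_path.read_text())
    config["auto_map"] = {
        "AutoConfig": "configuration.NewConfig",
        "AutoModel": "modeling.NewModel",
        "AutoModelForMaskedLM": "modeling.NewForMaskedLM",
    }
    config_path.write_text(json.dumps(config))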
+
+
+def test_load_success(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test successful load from local path."""
+    model = OSNeuralSparseDocV3GTE()
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._model is not None
+    assert model._tokenizer is not None
+
+
+def test_load_file_not_found():
+    """Test load raises FileNotFoundError for missing path."""
+    model = OSNeuralSparseDocV3GTE()
+    nonexistent_path = Path("/nonexistent/path")
+
+    with pytest.raises(FileNotFoundError, match="Model not found at path"):
+        model.load(nonexistent_path)
+
+
+def test_load_initializes_model_and_tokenizer(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test load initializes _model and _tokenizer attributes."""
+    model = OSNeuralSparseDocV3GTE()
+
+    assert model._model is None
+    assert model._tokenizer is None
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._model is not None
+    assert model._tokenizer is not None
+
+
+def test_load_sets_up_special_token_ids(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test load sets up _special_token_ids list."""
+    model = OSNeuralSparseDocV3GTE()
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._special_token_ids is not None
+    assert isinstance(model._special_token_ids, list)
+    assert len(model._special_token_ids) == 3  # CLS, SEP, PAD
+    assert 0 in model._special_token_ids  # [CLS] token id
+    assert 1 in model._special_token_ids  # [SEP] token id
+    assert 2 in model._special_token_ids  # [PAD] token id
+
+
+def test_load_sets_up_id_to_token_mapping(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test load creates _id_to_token mapping correctly."""
+    model = OSNeuralSparseDocV3GTE()
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._id_to_token is not None
+    assert isinstance(model._id_to_token, list)
+    assert len(model._id_to_token) == 5  # vocab_size from mock
+    assert model._id_to_token[0] == "[CLS]"
+    assert model._id_to_token[1] == "[SEP]"
+    assert model._id_to_token[2] == "[PAD]"
+    assert model._id_to_token[3] == "word1"
+    assert model._id_to_token[4] == "word2"
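
The load tests above, read together with the mock tokenizer in conftest.py, imply that load() derives its lookup state from the tokenizer's vocab and special_tokens_map attributes. A rough sketch of that bookkeeping under those assumptions; the helper name build_token_indexes is hypothetical, and the real load() additionally constructs the model and tokenizer via the transformers Auto classes:

# Sketch of the vocabulary bookkeeping implied by the load() assertions above;
# `tokenizer` stands in for whatever AutoTokenizer.from_pretrained returned.
def build_token_indexes(tokenizer):
    # _id_to_token: position i holds the token whose id is i (length == vocab size).
    id_to_token = [""] * tokenizer.vocab_size
    for token, token_id in tokenizer.vocab.items():
        id_to_token[token_id] = token

    # _special_token_ids: ids of the special tokens ([CLS], [SEP], [PAD] in the mock).
    special_token_ids = [
        tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()
    ]
    return special_token_ids, id_to_token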
