Commit 8899d51

Testing for OSNeuralSparseDocV3GTE download and load

1 parent 80be7fe

File tree: 2 files changed, +341 -0 lines changed
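For orientation, the behaviour under test is a two-step download-then-load flow on the model class. A minimal caller-side sketch, using only the constructor and method names exercised by the tests below (the output path is illustrative, not taken from the repository):

from pathlib import Path

from embeddings.models.os_neural_sparse_doc_v3_gte import OSNeuralSparseDocV3GTE

# Hypothetical usage inferred from the tests in this commit, not from the production code.
model = OSNeuralSparseDocV3GTE()

# download() materializes the HuggingFace snapshot at the given path (a directory
# here; a path ending in ".zip" produces an archive instead) and returns the path.
model_path = model.download(Path("model_output"))

# load() reads the downloaded files and populates the private state the tests
# assert on: _model, _tokenizer, _special_token_ids, _id_to_token.
model.load(model_path)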

tests/conftest.py

Lines changed: 121 additions & 0 deletions
@@ -1,3 +1,5 @@
+import json
+import logging
 import zipfile
 from pathlib import Path
 
@@ -6,6 +8,8 @@
 
 from embeddings.models.base import BaseEmbeddingModel
 
+logger = logging.getLogger(__name__)
+
 
 @pytest.fixture(autouse=True)
 def _test_env(monkeypatch):
@@ -32,8 +36,125 @@ def download(self, output_path: Path) -> Path:
             zf.writestr("tokenizer.json", '{"version": "1.0"}')
         return output_path
 
+    def load(self, model_path: str | Path) -> None:  # noqa: ARG002
+        logger.info("Model loaded successfully, 1.5s")
+
 
 @pytest.fixture
 def mock_model():
     """Fixture providing a MockEmbeddingModel instance."""
     return MockEmbeddingModel()
+
+
+@pytest.fixture
+def neural_sparse_doc_v3_gte_fake_model_directory(tmp_path):
+    """Create a fake downloaded model directory with required files."""
+    model_dir = tmp_path / "fake_model"
+    model_dir.mkdir()
+
+    # create config.json
+    config_json = {
+        "model_type": "distilbert",
+        "vocab_size": 30000,
+        "auto_map": {
+            "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+            "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+        },
+    }
+    (model_dir / "config.json").write_text(json.dumps(config_json))
+
+    # create modeling.py and configuration.py
+    (model_dir / "modeling.py").write_text("# mock modeling code")
+    (model_dir / "configuration.py").write_text("# mock configuration code")
+
+    # create tokenizer files
+    (model_dir / "tokenizer.json").write_text('{"version": "1.0"}')
+    (model_dir / "vocab.txt").write_text("word1\nword2\n")
+
+    return model_dir
+
+
+@pytest.fixture
+def neural_sparse_doc_v3_gte_mock_huggingface_snapshot(monkeypatch, tmp_path):
+    """Mock snapshot_download to create fake model files locally."""
+
+    def mock_snapshot(repo_id, local_dir, **kwargs):
+        """Create fake model files based on repo_id."""
+        local_path = Path(local_dir)
+        local_path.mkdir(parents=True, exist_ok=True)
+
+        if repo_id == "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte":
+            # create main model files
+            config_json = {
+                "model_type": "distilbert",
+                "vocab_size": 30000,
+                "auto_map": {
+                    "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
+                    "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
+                },
+            }
+            (local_path / "config.json").write_text(json.dumps(config_json))
+            (local_path / "pytorch_model.bin").write_bytes(b"fake weights")
+            (local_path / "tokenizer.json").write_text('{"version": "1.0"}')
+
+        elif repo_id == "Alibaba-NLP/new-impl":
+            # create alibaba dependency files
+            (local_path / "modeling.py").write_text("# Alibaba modeling code")
+            (local_path / "configuration.py").write_text("# Alibaba configuration code")
+
+        return str(local_path)
+
+    monkeypatch.setattr(
+        "embeddings.models.os_neural_sparse_doc_v3_gte.snapshot_download", mock_snapshot
+    )
+    return mock_snapshot
+
+
+@pytest.fixture
+def neural_sparse_doc_v3_gte_mock_transformers_models(monkeypatch):
+    """Mock AutoModelForMaskedLM and AutoTokenizer."""
+
+    class MockTokenizer:
+        """Mock tokenizer with necessary attributes."""
+
+        def __init__(self, *args, **kwargs):  # noqa: ARG002
+            self.vocab = {
+                "[CLS]": 0,
+                "[SEP]": 1,
+                "[PAD]": 2,
+                "word1": 3,
+                "word2": 4,
+            }
+            self.vocab_size = len(self.vocab)
+            self.special_tokens_map = {
+                "cls_token": "[CLS]",
+                "sep_token": "[SEP]",
+                "pad_token": "[PAD]",
+            }
+
+    class MockModel:
+        """Mock model with necessary attributes."""
+
+        def __init__(self, *args, **kwargs):  # noqa: ARG002
+            self.config = {"vocab_size": 30000}
+
+    class MockAutoTokenizer:
+        @staticmethod
+        def from_pretrained(*args, **kwargs):  # noqa: ARG004
+            return MockTokenizer()
+
+    class MockAutoModelForMaskedLM:
+        @staticmethod
+        def from_pretrained(*args, **kwargs):  # noqa: ARG004
+            return MockModel()
+
+    monkeypatch.setattr(
+        "embeddings.models.os_neural_sparse_doc_v3_gte.AutoTokenizer",
+        MockAutoTokenizer,
+    )
+    monkeypatch.setattr(
+        "embeddings.models.os_neural_sparse_doc_v3_gte.AutoModelForMaskedLM",
+        MockAutoModelForMaskedLM,
+    )
+
+    return {"tokenizer": MockTokenizer, "model": MockModel}
Second changed file: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
+"""Tests for OSNeuralSparseDocV3GTE embedding model."""
+
+# ruff: noqa: SLF001, PLR2004
+
+import json
+from pathlib import Path
+
+import pytest
+
+from embeddings.models.os_neural_sparse_doc_v3_gte import OSNeuralSparseDocV3GTE
+
+
+def test_init():
+    """Test model initialization."""
+    model = OSNeuralSparseDocV3GTE()
+    assert model._model is None
+    assert model._tokenizer is None
+    assert model._special_token_ids is None
+    assert model._id_to_token is None
+
+
+def test_model_uri():
+    """Test model_uri property returns correct HuggingFace URI."""
+    model = OSNeuralSparseDocV3GTE()
+    assert (
+        model.model_uri
+        == "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte"
+    )
+    assert (
+        model.MODEL_URI
+        == "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte"
+    )
+
+
+def test_download_to_directory(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test download to directory (not zip)."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model_output"
+
+    result = model.download(output_path)
+
+    assert result == output_path
+    assert output_path.exists()
+    assert (output_path / "config.json").exists()
+    assert (output_path / "pytorch_model.bin").exists()
+    assert (output_path / "tokenizer.json").exists()
+
+
+def test_download_to_zip_file(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test download creates zip when path ends in .zip."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model.zip"
+
+    result = model.download(output_path)
+
+    assert result == output_path
+    assert output_path.exists()
+    assert output_path.suffix == ".zip"
+
+
+def test_download_calls_patch_method(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path, monkeypatch
+):
+    """Test that download calls the Alibaba patching method."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model_output"
+
+    patch_called = False
+
+    def mock_patch(temp_path):
+        nonlocal patch_called
+        patch_called = True
+
+    monkeypatch.setattr(model, "_patch_local_model_with_alibaba_new_impl", mock_patch)
+
+    model.download(output_path)
+
+    assert patch_called
+
+
+def test_download_returns_path(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test download returns the output path."""
+    model = OSNeuralSparseDocV3GTE()
+    output_path = tmp_path / "model_output"
+
+    result = model.download(output_path)
+
+    assert result == output_path
+    assert isinstance(result, Path)
+
+
+def test_patch_downloads_alibaba_model(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test patch method downloads Alibaba-NLP/new-impl."""
+    model = OSNeuralSparseDocV3GTE()
+    model_temp_path = tmp_path / "temp_model"
+    model_temp_path.mkdir()
+    (model_temp_path / "config.json").write_text('{"model_type": "test"}')
+
+    model._patch_local_model_with_alibaba_new_impl(model_temp_path)
+
+    assert (model_temp_path / "modeling.py").exists()
+    assert (model_temp_path / "configuration.py").exists()
+
+
+def test_patch_copies_files(neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path):
+    """Test patch copies modeling.py and configuration.py."""
+    model = OSNeuralSparseDocV3GTE()
+    model_temp_path = tmp_path / "temp_model"
+    model_temp_path.mkdir()
+    (model_temp_path / "config.json").write_text('{"model_type": "test"}')
+
+    model._patch_local_model_with_alibaba_new_impl(model_temp_path)
+
+    modeling_content = (model_temp_path / "modeling.py").read_text()
+    config_content = (model_temp_path / "configuration.py").read_text()
+
+    assert "Alibaba modeling code" in modeling_content
+    assert "Alibaba configuration code" in config_content
+
+
+def test_patch_updates_config_json(
+    neural_sparse_doc_v3_gte_mock_huggingface_snapshot, tmp_path
+):
+    """Test patch updates auto_map in config.json."""
+    model = OSNeuralSparseDocV3GTE()
+    model_temp_path = tmp_path / "temp_model"
+    model_temp_path.mkdir()
+    initial_config = {"model_type": "test", "vocab_size": 30000}
+    (model_temp_path / "config.json").write_text(json.dumps(initial_config))
+
+    model._patch_local_model_with_alibaba_new_impl(model_temp_path)
+
+    updated_config = json.loads((model_temp_path / "config.json").read_text())
+
+    assert "auto_map" in updated_config
+    assert updated_config["auto_map"]["AutoConfig"] == "configuration.NewConfig"
+    assert updated_config["auto_map"]["AutoModel"] == "modeling.NewModel"
+    assert updated_config["auto_map"]["AutoModelForMaskedLM"] == "modeling.NewForMaskedLM"
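
The three patch tests above pin down the contract of _patch_local_model_with_alibaba_new_impl: fetch Alibaba-NLP/new-impl, drop modeling.py and configuration.py next to the model files, and rewrite auto_map in config.json. A minimal sketch of a routine satisfying that contract, assuming the module imports snapshot_download from huggingface_hub (which the conftest monkeypatching suggests); the method body is shown here as a free function for brevity, and it is a sketch, not the commit's actual implementation:

import json
import shutil
import tempfile
from pathlib import Path

from huggingface_hub import snapshot_download


def _patch_local_model_with_alibaba_new_impl(model_temp_path: Path) -> None:
    """Copy the Alibaba-NLP/new-impl code next to the model and point auto_map at it."""
    # Fetch the dependency repo into a scratch directory and copy the two files over.
    with tempfile.TemporaryDirectory() as impl_dir:
        snapshot_download(repo_id="Alibaba-NLP/new-impl", local_dir=impl_dir)
        for name in ("modeling.py", "configuration.py"):
            shutil.copy(Path(impl_dir) / name, model_temp_path / name)

    # Repoint auto_map at the local copies, matching the assertions above.
    config_path = model_temp_path / "config.json"
    config = json.loads(config_path.read_text())
    config["auto_map"] = {
        "AutoConfig": "configuration.NewConfig",
        "AutoModel": "modeling.NewModel",
        "AutoModelForMaskedLM": "modeling.NewForMaskedLM",
    }
    config_path.write_text(json.dumps(config))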
+
+
+def test_load_success(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test successful load from local path."""
+    model = OSNeuralSparseDocV3GTE()
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._model is not None
+    assert model._tokenizer is not None
+
+
+def test_load_file_not_found():
+    """Test load raises FileNotFoundError for missing path."""
+    model = OSNeuralSparseDocV3GTE()
+    nonexistent_path = Path("/nonexistent/path")
+
+    with pytest.raises(FileNotFoundError, match="Model not found at path"):
+        model.load(nonexistent_path)
+
+
+def test_load_initializes_model_and_tokenizer(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test load initializes _model and _tokenizer attributes."""
+    model = OSNeuralSparseDocV3GTE()
+
+    assert model._model is None
+    assert model._tokenizer is None
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._model is not None
+    assert model._tokenizer is not None
+
+
+def test_load_sets_up_special_token_ids(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test load sets up _special_token_ids list."""
+    model = OSNeuralSparseDocV3GTE()
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._special_token_ids is not None
+    assert isinstance(model._special_token_ids, list)
+    assert len(model._special_token_ids) == 3  # CLS, SEP, PAD
+    assert 0 in model._special_token_ids  # [CLS] token id
+    assert 1 in model._special_token_ids  # [SEP] token id
+    assert 2 in model._special_token_ids  # [PAD] token id
+
+
+def test_load_sets_up_id_to_token_mapping(
+    neural_sparse_doc_v3_gte_fake_model_directory,
+    neural_sparse_doc_v3_gte_mock_transformers_models,
+):
+    """Test load creates _id_to_token mapping correctly."""
+    model = OSNeuralSparseDocV3GTE()
+
+    model.load(neural_sparse_doc_v3_gte_fake_model_directory)
+
+    assert model._id_to_token is not None
+    assert isinstance(model._id_to_token, list)
+    assert len(model._id_to_token) == 5  # vocab_size from mock
+    assert model._id_to_token[0] == "[CLS]"
+    assert model._id_to_token[1] == "[SEP]"
+    assert model._id_to_token[2] == "[PAD]"
+    assert model._id_to_token[3] == "word1"
+    assert model._id_to_token[4] == "word2"
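
The load tests above, read together with the mock tokenizer in conftest.py, imply that load() derives its lookup state from the tokenizer's vocab and special_tokens_map attributes. A rough sketch of that bookkeeping under those assumptions; the helper name build_token_indexes is hypothetical, and the real load() additionally constructs the model and tokenizer via the transformers Auto classes:

# Sketch of the vocabulary bookkeeping implied by the load() assertions above;
# `tokenizer` stands in for whatever AutoTokenizer.from_pretrained returned.
def build_token_indexes(tokenizer):
    # _id_to_token: position i holds the token whose id is i (length == vocab size).
    id_to_token = [""] * tokenizer.vocab_size
    for token, token_id in tokenizer.vocab.items():
        id_to_token[token_id] = token

    # _special_token_ids: ids of the special tokens ([CLS], [SEP], [PAD] in the mock).
    special_token_ids = [
        tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()
    ]
    return special_token_ids, id_to_token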
