From ff16285c2daf6095565ba46100cb1f3832a451b1 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 14:33:43 +0100 Subject: [PATCH 01/17] CU-869an5f00: Update backend registration tests - use mocking --- medcat-den/tests/test_backend_registration.py | 36 ++----------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/medcat-den/tests/test_backend_registration.py b/medcat-den/tests/test_backend_registration.py index d2e36aa4..9ff2ea44 100644 --- a/medcat-den/tests/test_backend_registration.py +++ b/medcat-den/tests/test_backend_registration.py @@ -1,43 +1,11 @@ import pytest +from unittest.mock import MagicMock -from medcat.cat import CAT - -from medcat_den.base import ModelInfo -from medcat_den.wrappers import CATWrapper from medcat_den.backend import DenType, _remote_den_map, register_remote_den from medcat_den.resolver import resolve -class FakeDen: - def __init__(self, **kwargs): - return - - @property - def den_type(self) -> DenType: - return DenType.MEDCATTERY - - def list_available_models(self) -> list[ModelInfo]: - return [] - - def list_available_base_models(self) -> list[ModelInfo]: - return [] - - def list_available_derivative_models(self, model: ModelInfo - ) -> list[ModelInfo]: - return [] - - def fetch_model(self, model_info: ModelInfo) -> CATWrapper: - return - - def push_model(self, cat: CAT, description: str) -> None: - return - - def _push_model_from_file(self, file_path: str, description: str) -> None: - return - - def delete_model(self, model_info: ModelInfo, - allow_delete_base_models: bool = False) -> None: - return +FakeDen = MagicMock @pytest.fixture() From 5aac155b268fca1ef0eca7a8c322e0465b2db453 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 14:34:09 +0100 Subject: [PATCH 02/17] CU-869an5f00: Add new optional API for den to fine tune models --- medcat-den/src/medcat_den/den.py | 28 ++++++++++++++++++- .../src/medcat_den/den_impl/file_den.py | 13 +++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/medcat-den/src/medcat_den/den.py b/medcat-den/src/medcat_den/den.py index 4734b9f0..254dc247 100644 --- a/medcat-den/src/medcat_den/den.py +++ b/medcat-den/src/medcat_den/den.py @@ -1,6 +1,7 @@ -from typing import Protocol, Optional, runtime_checkable +from typing import Protocol, Optional, runtime_checkable, Union from medcat.cat import CAT +from medcat.data.mctexport import MedCATTrainerExport from medcat_den.base import ModelInfo from medcat_den.wrappers import CATWrapper @@ -102,6 +103,31 @@ def delete_model(self, model_info: ModelInfo, """ pass + def finetune_model(self, model_info: ModelInfo, + data: Union[list[str], MedCATTrainerExport] + ) -> ModelInfo: + """Finetune the model on the remote den. + + This is an optional API that is (generally) only available + for remote dens. The idea is that the data is sent to the remote + den and the finetuning is done on the remote. + + Args: + model_info (ModelInfo): The model info + data (Union[list[str], MedCATTrainerExport]): The list of project + ids (already on remote) or the trainer export to train on. + + Returns: + ModelInfo: The resulting model. + + Raises: + UnsupportedAPIException: If the den does not support this API. + """ + + +class UnsupportedAPIException(ValueError): + pass + def get_default_den( type_: Optional[DenType] = None, diff --git a/medcat-den/src/medcat_den/den_impl/file_den.py b/medcat-den/src/medcat_den/den_impl/file_den.py index a7ea7ce8..43826dd2 100644 --- a/medcat-den/src/medcat_den/den_impl/file_den.py +++ b/medcat-den/src/medcat_den/den_impl/file_den.py @@ -1,4 +1,4 @@ -from typing import Optional, cast, Any +from typing import Optional, cast, Any, Union import json from datetime import datetime @@ -7,8 +7,10 @@ import shutil from medcat.cat import CAT +from medcat.data.mctexport import MedCATTrainerExport -from medcat_den.den import Den, DuplicateModelException +from medcat_den.den import ( + Den, DuplicateModelException, UnsupportedAPIException) from medcat_den.backend import DenType from medcat_den.base import ModelInfo from medcat_den.wrappers import CATWrapper @@ -220,3 +222,10 @@ def delete_model(self, model_info: ModelInfo, folder_path = zip_path.removesuffix(".zip") if os.path.exists(folder_path): shutil.rmtree(folder_path) + + def finetune_model(self, model_info: ModelInfo, + data: Union[list[str], MedCATTrainerExport]): + raise UnsupportedAPIException( + "Local den does not support finetuning on the den. " + "Use a remote den instead or perform training locally." + ) From 8a7039776e3f20e84d96a5f321f154f599399cf8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 14:51:39 +0100 Subject: [PATCH 03/17] CU-869an5f00: Add eval API --- medcat-den/src/medcat_den/den.py | 24 +++++++++++++++++++ .../src/medcat_den/den_impl/file_den.py | 7 ++++++ 2 files changed, 31 insertions(+) diff --git a/medcat-den/src/medcat_den/den.py b/medcat-den/src/medcat_den/den.py index 254dc247..8a9dde87 100644 --- a/medcat-den/src/medcat_den/den.py +++ b/medcat-den/src/medcat_den/den.py @@ -112,6 +112,9 @@ def finetune_model(self, model_info: ModelInfo, for remote dens. The idea is that the data is sent to the remote den and the finetuning is done on the remote. + If raw data is given, unless already present remotely, it will be + uploaded to the remote den. + Args: model_info (ModelInfo): The model info data (Union[list[str], MedCATTrainerExport]): The list of project @@ -124,6 +127,27 @@ def finetune_model(self, model_info: ModelInfo, UnsupportedAPIException: If the den does not support this API. """ + def evaluate_model(self, model_info: ModelInfo, + data: Union[list[str], MedCATTrainerExport]) -> dict: + """Evaluate model on remote den. + + This is an optional API that is (generally) only available + for remote dens. The idea is that the data is sent to the remote + den and the metrics are gathered on the remote. + + If raw data is given, unless already present remotely, it will be + uploaded to the remote den. + + Args: + model_info (ModelInfo): The model info. + data (Union[list[str], MedCATTrainerExport]): The list of project + ids (already on remote) or the trainer export to train on. + + Returns: + dict: The resulting metrics. + """ + pass + class UnsupportedAPIException(ValueError): pass diff --git a/medcat-den/src/medcat_den/den_impl/file_den.py b/medcat-den/src/medcat_den/den_impl/file_den.py index 43826dd2..2aff5354 100644 --- a/medcat-den/src/medcat_den/den_impl/file_den.py +++ b/medcat-den/src/medcat_den/den_impl/file_den.py @@ -229,3 +229,10 @@ def finetune_model(self, model_info: ModelInfo, "Local den does not support finetuning on the den. " "Use a remote den instead or perform training locally." ) + + def evaluate_model(self, model_info: ModelInfo, + data: Union[list[str], MedCATTrainerExport]) -> dict: + raise UnsupportedAPIException( + "Local den does not support evaluation on the den. " + "Use a remote den instead or perform evaluation locally." + ) From d5a57dfe6feb3ee7f6ac16ffcb6aa77eb9ff70b5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 15:15:49 +0100 Subject: [PATCH 04/17] CU-869an5f00: Add option to disallow local fine-tune and/or push of fine-tuned models --- medcat-den/src/medcat_den/config.py | 2 + .../src/medcat_den/resolver/resolver.py | 42 +++++++++++++------ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/medcat-den/src/medcat_den/config.py b/medcat-den/src/medcat_den/config.py index 56927f83..b1d7ebc6 100644 --- a/medcat-den/src/medcat_den/config.py +++ b/medcat-den/src/medcat_den/config.py @@ -17,6 +17,8 @@ class LocalDenConfig(DenConfig): class RemoteDenConfig(DenConfig): host: str credentials: dict + allow_local_fine_tune: bool + allow_push_fine_tuned: bool class LocalCacheConfig(BaseModel): diff --git a/medcat-den/src/medcat_den/resolver/resolver.py b/medcat-den/src/medcat_den/resolver/resolver.py index 58342f6b..bbafa901 100644 --- a/medcat-den/src/medcat_den/resolver/resolver.py +++ b/medcat-den/src/medcat_den/resolver/resolver.py @@ -38,6 +38,12 @@ MEDCAT_DEN_LOCAL_CACHE_MAX_SIZE = "MEDCAT_DEN_LOCAL_CACHE_MAX_SIZE" MEDCAT_DEN_LOCAL_CACHE_EVICTION_POLICY = ( "MEDCAT_DEN_LOCAL_CACHE_EVICTION_POLICY") +MEDCAT_DEN_REMOTE_ALLOW_LOCAL_FINE_TUNE = ( + "MEDCAT_DEN_REMOTE_ALLOW_LOCAL_FINE_TUNE") +MEDCAT_DEN_REMOTE_ALLOW_PUSH_FINETUNED = ( + "MEDCAT_DEN_REMOTE_ALLOW_PUSH_FINETUNED") + +ALLOW_OPTION_LOWERCASE = ("true", "yes", "1", "y") def is_writable(path: str, propgate: bool = True) -> bool: @@ -52,7 +58,10 @@ def _init_den_cnf( type_: Optional[DenType] = None, location: Optional[str] = None, host: Optional[str] = None, - credentials: Optional[dict] = None,) -> DenConfig: + credentials: Optional[dict] = None, + remote_allow_local_fine_tune: Optional[str] = None, + remote_allow_push_fine_tuned: Optional[str] = None, + ) -> DenConfig: # Priority: args > env > defaults type_in = ( type_ @@ -82,13 +91,25 @@ def _init_den_cnf( den_cnf = LocalDenConfig(type=type_final, location=location_final) else: + host = host or os.getenv(MEDCAT_DEN_REMOTE_HOST) if not host: raise ValueError("Need to specify a host for remote den") if not credentials: raise ValueError("Need to specify credentials for remote den") - den_cnf = RemoteDenConfig(type=type_final, - host=host, - credentials=credentials) + allow_local_fine_tune = str( + remote_allow_local_fine_tune or + os.getenv(MEDCAT_DEN_REMOTE_ALLOW_LOCAL_FINE_TUNE) + ).lower() in ALLOW_OPTION_LOWERCASE + allow_push_fine_tuned = str( + remote_allow_push_fine_tuned or + os.getenv(MEDCAT_DEN_REMOTE_ALLOW_PUSH_FINETUNED) + ).lower() in ALLOW_OPTION_LOWERCASE + den_cnf = RemoteDenConfig( + type=type_final, + host=host, + credentials=credentials, + allow_local_fine_tune=allow_local_fine_tune, + allow_push_fine_tuned=allow_push_fine_tuned) return den_cnf @@ -101,8 +122,12 @@ def resolve( expiration_time: Optional[int] = None, max_size: Optional[int] = None, eviction_policy: Optional[str] = None, + remote_allow_local_fine_tune: Optional[str] = None, + remote_allow_push_fine_tuned: Optional[str] = None, ) -> Den: - den_cnf = _init_den_cnf(type_, location, host, credentials) + den_cnf = _init_den_cnf(type_, location, host, credentials, + remote_allow_local_fine_tune, + remote_allow_push_fine_tuned) den = resolve_from_config(den_cnf) lc_cnf = _init_lc_cnf( local_cache_path, expiration_time, max_size, eviction_policy) @@ -126,13 +151,6 @@ def _resolve_local(config: LocalDenConfig) -> LocalFileDen: def resolve_from_config(config: DenConfig) -> Den: if isinstance(config, LocalDenConfig): return _resolve_local(config) - # TODO: support remote (e) - # elif type_final == DenType.MEDCATTERY: - # host = host or os.getenv(MEDCAT_DEN_REMOTE_HOST) - # if host is None: - # raise ValueError("Remote DEN requires a host address") - # # later you’d plug in MedcatteryRemoteDen, MLFlowDen, etc. - # return MedCATteryDen(host=host, credentials=credentials) elif has_registered_remote_den(config.type): den_cls = get_registered_remote_den(config.type) den = den_cls(cnf=config) From 30f7f1223d3616f68ad7a4ebec4c292473de6463 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 15:26:10 +0100 Subject: [PATCH 05/17] CU-869an5f00: Allow a DenWrapper to have a den config alongside it --- medcat-den/src/medcat_den/den_impl/file_den.py | 3 ++- medcat-den/src/medcat_den/wrappers.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/medcat-den/src/medcat_den/den_impl/file_den.py b/medcat-den/src/medcat_den/den_impl/file_den.py index 2aff5354..109ebe6b 100644 --- a/medcat-den/src/medcat_den/den_impl/file_den.py +++ b/medcat-den/src/medcat_den/den_impl/file_den.py @@ -164,7 +164,8 @@ def fetch_model(self, model_info: ModelInfo) -> CATWrapper: model_path = self._get_model_zip_path(model_info) return cast( CATWrapper, - CATWrapper.load_model_pack(model_path, model_info=model_info)) + CATWrapper.load_model_pack(model_path, model_info=model_info, + den_cnf=self._cnf)) def push_model(self, cat: CAT, description: str) -> None: if isinstance(cat, CATWrapper): diff --git a/medcat-den/src/medcat_den/wrappers.py b/medcat-den/src/medcat_den/wrappers.py index c3f00804..ba6d2fd3 100644 --- a/medcat-den/src/medcat_den/wrappers.py +++ b/medcat-den/src/medcat_den/wrappers.py @@ -5,6 +5,7 @@ from medcat.storage.serialisers import AvailableSerialisers from medcat_den.base import ModelInfo +from medcat_den.config import DenConfig class CATWrapper(CAT): @@ -20,6 +21,7 @@ class CATWrapper(CAT): """ _model_info: ModelInfo + _den_cnf: DenConfig def save_model_pack( self, target_folder: str, pack_name: str = DEFAULT_PACK_NAME, @@ -63,10 +65,11 @@ def load_model_pack(cls, model_pack_path: str, config_dict: Optional[dict] = None, addon_config_dict: Optional[dict[str, dict]] = None, model_info: Optional[ModelInfo] = None, + den_cnf: Optional[DenConfig] = None, ) -> 'CAT': """Load the model pack from file. - This also + This may also disallow model load from disk in certain secnarios. Args: model_pack_path (str): The model pack path. @@ -80,6 +83,9 @@ def load_model_pack(cls, model_pack_path: str, model_inof (Optional[ModelInfo]): The base model info based on which the model was originally fetched. Should not be left None. + den_cnf: (Optional[DenConfig]): The config for the den being + used. Should not be left None. + Raises: ValueError: If the saved data does not represent a model pack. @@ -95,7 +101,10 @@ def load_model_pack(cls, model_pack_path: str, cat.__class__ = CATWrapper if model_info is None: raise CannotWrapModel("Model info must be provided") + if den_cnf is None: + raise CannotWrapModel("den_cnf must be provided") cat._model_info = model_info + cat._den_cnf = den_cnf return cat From e3ced4a07a53c2178c4380d4f567cc8a160d2902 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 15:30:40 +0100 Subject: [PATCH 06/17] CU-869an5f00: Disallow local fine tune with remote dens if/when applicable --- medcat-den/src/medcat_den/wrappers.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/medcat-den/src/medcat_den/wrappers.py b/medcat-den/src/medcat_den/wrappers.py index ba6d2fd3..9964734f 100644 --- a/medcat-den/src/medcat_den/wrappers.py +++ b/medcat-den/src/medcat_den/wrappers.py @@ -5,7 +5,7 @@ from medcat.storage.serialisers import AvailableSerialisers from medcat_den.base import ModelInfo -from medcat_den.config import DenConfig +from medcat_den.config import DenConfig, RemoteDenConfig class CATWrapper(CAT): @@ -56,6 +56,16 @@ def save_model_pack( if not force_save_local and not is_injected_for_save(): raise CannotSaveOnDiskException( f"Cannot save model on disk: {CATWrapper.__doc__}") + if (is_injected_for_save() and isinstance( + self._den_cnf, RemoteDenConfig) and + not self._den_cnf.allow_push_fine_tuned): + raise CannotSaveOnDiskException( + "Cannot save fine-tuned model onto a remote den." + "In order to make full use of the remote den capabilities, " + "use the den API to fine tune a model directly on the den. " + "See `Den.finetune_model` for details or set the config " + "option of `allow_push_fine_tuned` to True" + ) return super().save_model_pack( target_folder, pack_name, serialiser_type, make_archive, only_archive, add_hash_to_pack_name, change_description) From eaa435ee8d85e320375aeab1cd45f490bc049248 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 15:48:41 +0100 Subject: [PATCH 07/17] CU-869an5f00: Separate local and remote exceptions --- medcat-den/src/medcat_den/wrappers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/medcat-den/src/medcat_den/wrappers.py b/medcat-den/src/medcat_den/wrappers.py index 9964734f..e4ee370f 100644 --- a/medcat-den/src/medcat_den/wrappers.py +++ b/medcat-den/src/medcat_den/wrappers.py @@ -59,7 +59,7 @@ def save_model_pack( if (is_injected_for_save() and isinstance( self._den_cnf, RemoteDenConfig) and not self._den_cnf.allow_push_fine_tuned): - raise CannotSaveOnDiskException( + raise CannotSendToRemoteException( "Cannot save fine-tuned model onto a remote den." "In order to make full use of the remote den capabilities, " "use the den API to fine tune a model directly on the den. " @@ -128,3 +128,9 @@ class CannotSaveOnDiskException(ValueError): def __init__(self, *args): super().__init__(*args) + + +class CannotSendToRemoteException(ValueError): + + def __call__(self, *args): + return super().__call__(*args) From 2e2060d93ffcf6728733f33dd4083e16473ce0cb Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 15:55:43 +0100 Subject: [PATCH 08/17] CU-869an5f00: Add a few tests for disallowing push --- medcat-den/tests/test_remote_den_disallows.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 medcat-den/tests/test_remote_den_disallows.py diff --git a/medcat-den/tests/test_remote_den_disallows.py b/medcat-den/tests/test_remote_den_disallows.py new file mode 100644 index 00000000..7a514908 --- /dev/null +++ b/medcat-den/tests/test_remote_den_disallows.py @@ -0,0 +1,109 @@ +from typing import cast +from medcat_den.config import RemoteDenConfig +from medcat_den.backend import DenType +from medcat_den.injection import injected_den +from medcat_den.wrappers import CATWrapper, CannotSendToRemoteException +from medcat_den.den import Den +from medcat_den.den_impl.file_den import LocalFileDen +from medcat_den.base import ModelInfo + +from medcat.cat import CAT + +import pytest + +from .test_file_system_den import def_model_pack, den, UNSUP_TRAIN_EXAMPLE # noqa + + +def get_wrapped_model_pack( + in_model_pack: CAT, cnf: RemoteDenConfig) -> CATWrapper: # noqa + # make it a wrapper + in_model_pack.__class__ = CATWrapper + model_pack = cast(CATWrapper, in_model_pack) + # set required stuff, mostly the config + model_pack._den_cnf = cnf + # set model info + model_pack._model_info = ModelInfo.from_model_pack(model_pack) + return model_pack + + +@pytest.fixture +def cnf_disallow_all(): + return RemoteDenConfig(type=DenType.MEDCATTERY, + host="ABC", + credentials={"A": "B"}, + allow_local_fine_tune=False, + allow_push_fine_tuned=False, + ) + + +@pytest.fixture +def cnf_allow_push_only(): + return RemoteDenConfig(type=DenType.MEDCATTERY, + host="ABC", + credentials={"A": "B"}, + allow_local_fine_tune=False, + allow_push_fine_tuned=True, + ) + + +@pytest.fixture +def cnf_allow_finetune_only(): + return RemoteDenConfig(type=DenType.MEDCATTERY, + host="ABC", + credentials={"A": "B"}, + allow_local_fine_tune=True, + allow_push_fine_tuned=False, + ) + + +@pytest.fixture +def cnf_allow_both(): + return RemoteDenConfig(type=DenType.MEDCATTERY, + host="ABC", + credentials={"A": "B"}, + allow_local_fine_tune=True, + allow_push_fine_tuned=True, + ) + + +@pytest.fixture +def den_disallow_all(den: LocalFileDen, cnf_disallow_all: RemoteDenConfig) -> Den: # noqa + # NOTE: local den with remote config + den._cnf = cnf_disallow_all + return den + + +@pytest.fixture +def den_allow_push_only(den: LocalFileDen, cnf_allow_push_only: RemoteDenConfig) -> Den: # noqa + # NOTE: local den with remote config + den._cnf = cnf_allow_push_only + return den + + +@pytest.fixture +def den_allow_finetune_only(den: LocalFileDen, cnf_allow_finetune_only: RemoteDenConfig) -> Den: # noqa + # NOTE: local den with remote config + den._cnf = cnf_allow_finetune_only + return den + + +def test_can_disallow_push_all(def_model_pack: CAT, den_disallow_all: LocalFileDen): # noqa + model_pack = get_wrapped_model_pack( + def_model_pack, den_disallow_all._cnf) + with injected_den(lambda: den_disallow_all, inject_save=True): + # do some training + model_pack.trainer.train_unsupervised(UNSUP_TRAIN_EXAMPLE) + # attempt to save to den + with pytest.raises(CannotSendToRemoteException): + model_pack.save_model_pack("Did some fine-tuning") + + +def test_can_disallow_push_only(def_model_pack: CAT, den_allow_finetune_only: LocalFileDen): # noqa + model_pack = get_wrapped_model_pack( + def_model_pack, den_allow_finetune_only._cnf) + with injected_den(lambda: den_allow_finetune_only, inject_save=True): + # do some training + model_pack.trainer.train_unsupervised(UNSUP_TRAIN_EXAMPLE) + # attempt to save to den + with pytest.raises(CannotSendToRemoteException): + model_pack.save_model_pack("Did some fine-tuning") From 42cb5b3e37f4f8e21c7b672df26b2a667ee5ee99 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 15:59:05 +0100 Subject: [PATCH 09/17] CU-869an5f00: Add another simple test --- medcat-den/tests/test_remote_den_disallows.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/medcat-den/tests/test_remote_den_disallows.py b/medcat-den/tests/test_remote_den_disallows.py index 7a514908..b09f7668 100644 --- a/medcat-den/tests/test_remote_den_disallows.py +++ b/medcat-den/tests/test_remote_den_disallows.py @@ -87,6 +87,17 @@ def den_allow_finetune_only(den: LocalFileDen, cnf_allow_finetune_only: RemoteDe return den +def test_can_normally_push(def_model_pack: CAT, den: LocalFileDen): # noqa + model_pack = get_wrapped_model_pack( + def_model_pack, den._cnf) + with injected_den(lambda: den, inject_save=True): + # do some training + model_pack.trainer.train_unsupervised(UNSUP_TRAIN_EXAMPLE) + # should be able to just send to den + model_pack.save_model_pack("Did some fine-tuning") + + + def test_can_disallow_push_all(def_model_pack: CAT, den_disallow_all: LocalFileDen): # noqa model_pack = get_wrapped_model_pack( def_model_pack, den_disallow_all._cnf) From 7ecb08e57a8ebed346cfa6b90988894cfd2ce399 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:16:10 +0100 Subject: [PATCH 10/17] CU-869an5f00: Add wrapper for training to control disallowing supervised training --- medcat-den/src/medcat_den/wrappers.py | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/medcat-den/src/medcat_den/wrappers.py b/medcat-den/src/medcat_den/wrappers.py index e4ee370f..de86c29a 100644 --- a/medcat-den/src/medcat_den/wrappers.py +++ b/medcat-den/src/medcat_den/wrappers.py @@ -3,6 +3,8 @@ from medcat.cat import CAT from medcat.utils.defaults import DEFAULT_PACK_NAME from medcat.storage.serialisers import AvailableSerialisers +from medcat.trainer import Trainer +from medcat.data.mctexport import MedCATTrainerExport from medcat_den.base import ModelInfo from medcat_den.config import DenConfig, RemoteDenConfig @@ -70,6 +72,11 @@ def save_model_pack( target_folder, pack_name, serialiser_type, make_archive, only_archive, add_hash_to_pack_name, change_description) + @property + def trainer(self) -> Trainer: + tr = super().trainer + return WrappedTrainer(self._den_cnf, tr) + @classmethod def load_model_pack(cls, model_pack_path: str, config_dict: Optional[dict] = None, @@ -118,6 +125,38 @@ def load_model_pack(cls, model_pack_path: str, return cat +class WrappedTrainer(Trainer): + + def __init__(self, den_cnf: DenConfig, delegate: Trainer): + super().__init__(delegate.cdb, delegate.caller, delegate._pipeline) + self._den_cnf = den_cnf + + def train_supervised_raw( + self, data: MedCATTrainerExport, reset_cui_count: bool = False, + nepochs: int = 1, print_stats: int = 0, use_filters: bool = False, + terminate_last: bool = False, use_overlaps: bool = False, + use_cui_doc_limit: bool = False, test_size: float = 0, + devalue_others: bool = False, use_groups: bool = False, + never_terminate: bool = False, + train_from_false_positives: bool = False, + extra_cui_filter: Optional[set[str]] = None, + disable_progress: bool = False, train_addons: bool = False): + if (isinstance(self._den_cnf, RemoteDenConfig) and + not self._den_cnf.allow_local_fine_tune): + raise NotAllowedToFineTuneLocallyException( + "You are not allowed to fine-tune remote models locally. " + "Please use the `Den.finetune_model` method directly to " + "fine tune on the remote den, or if required, set the " + "`allow_local_fine_tune` config value to `True`." + ) + return super().train_supervised_raw( + data, reset_cui_count, nepochs, print_stats, use_filters, + terminate_last, use_overlaps, use_cui_doc_limit, test_size, + devalue_others, use_groups, never_terminate, + train_from_false_positives, extra_cui_filter, disable_progress, + train_addons) + + class CannotWrapModel(ValueError): def __init__(self, *args): @@ -134,3 +173,9 @@ class CannotSendToRemoteException(ValueError): def __call__(self, *args): return super().__call__(*args) + + +class NotAllowedToFineTuneLocallyException(ValueError): + + def __call__(self, *args): + return super().__call__(*args) From eb6073ae3ac434b1c2cce106078cf1507be0680f Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:16:33 +0100 Subject: [PATCH 11/17] CU-869an5f00: Add a few simple tests for disallowing local training with a remote den --- medcat-den/tests/test_remote_den_disallows.py | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/medcat-den/tests/test_remote_den_disallows.py b/medcat-den/tests/test_remote_den_disallows.py index b09f7668..567c0fcb 100644 --- a/medcat-den/tests/test_remote_den_disallows.py +++ b/medcat-den/tests/test_remote_den_disallows.py @@ -3,6 +3,7 @@ from medcat_den.backend import DenType from medcat_den.injection import injected_den from medcat_den.wrappers import CATWrapper, CannotSendToRemoteException +from medcat_den.wrappers import NotAllowedToFineTuneLocallyException from medcat_den.den import Den from medcat_den.den_impl.file_den import LocalFileDen from medcat_den.base import ModelInfo @@ -109,7 +110,7 @@ def test_can_disallow_push_all(def_model_pack: CAT, den_disallow_all: LocalFileD model_pack.save_model_pack("Did some fine-tuning") -def test_can_disallow_push_only(def_model_pack: CAT, den_allow_finetune_only: LocalFileDen): # noqa +def test_can_disallow_save_finetune_only(def_model_pack: CAT, den_allow_finetune_only: LocalFileDen): # noqa model_pack = get_wrapped_model_pack( def_model_pack, den_allow_finetune_only._cnf) with injected_den(lambda: den_allow_finetune_only, inject_save=True): @@ -118,3 +119,31 @@ def test_can_disallow_push_only(def_model_pack: CAT, den_allow_finetune_only: Lo # attempt to save to den with pytest.raises(CannotSendToRemoteException): model_pack.save_model_pack("Did some fine-tuning") + + + +def test_can_normally_fine_tune(def_model_pack: CAT, den: LocalFileDen): # noqa + model_pack = get_wrapped_model_pack( + def_model_pack, den._cnf) + with injected_den(lambda: den, inject_save=True): + # should be able to just do some supervised training + model_pack.trainer.train_supervised_raw({"projects": []}) + + +def test_can_disallow_fine_tune_all(def_model_pack: CAT, den_disallow_all: LocalFileDen): # noqa + model_pack = get_wrapped_model_pack( + def_model_pack, den_disallow_all._cnf) + with injected_den(lambda: den_disallow_all, inject_save=True): + # attempt to save do supervised training + with pytest.raises(NotAllowedToFineTuneLocallyException): + model_pack.trainer.train_supervised_raw({"projects": []}) + + +def test_can_disallow_fine_tune_push_only(def_model_pack: CAT, den_allow_push_only: LocalFileDen): # noqa + model_pack = get_wrapped_model_pack( + def_model_pack, den_allow_push_only._cnf) + with injected_den(lambda: den_allow_push_only, inject_save=True): + # attempt to save do supervised training + with pytest.raises(NotAllowedToFineTuneLocallyException): + model_pack.trainer.train_supervised_raw({"projects": []}) + model_pack.save_model_pack("Did some fine-tuning") From 12861c1aebae2ecc7f11e6d898196da728514ff9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:20:08 +0100 Subject: [PATCH 12/17] CU-869an5f00: Add new environmental variables to README --- medcat-den/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/medcat-den/README.md b/medcat-den/README.md index bab9013d..37d00670 100644 --- a/medcat-den/README.md +++ b/medcat-den/README.md @@ -133,5 +133,7 @@ However, there's a set of environmental variables that can be set in order to cu | MEDCAT_DEN_LOCAL_CACHE_EXPIRATION_TIME | int | The expriation time for local cache (in seconds) | The default is 10 days | | MEDCAT_DEN_LOCAL_CACHE_MAX_SIZE | int | The maximum size of the cache in bytes | The default is 100 GB | | MEDCAT_DEN_LOCAL_CACHE_EVICTION_POLICY | str | The eviction policy for the local cache | The default is LRU | +| MEDCAT_DEN_REMOTE_ALLOW_PUSH_FINETUNED | bool | Whether to allow locallly fine tuned model to be pushed to remote dens | Defaults to False | +| MEDCAT_DEN_REMOTE_ALLOW_LOCAL_FINE_TUNE | bool | Whether to allow local fine tuning for remote dens | Defaults to False | When creating a den, the resolver will use the explicitly passed values first, and if none are provided, it will default to the ones defined in the environmental variables. From 10c2831fccb685f0075342c1024aa91f36fcdfe3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:23:23 +0100 Subject: [PATCH 13/17] CU-869an5f00: Add a small note in code regarding defaults --- medcat-den/src/medcat_den/resolver/resolver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/medcat-den/src/medcat_den/resolver/resolver.py b/medcat-den/src/medcat_den/resolver/resolver.py index bbafa901..325dc6c5 100644 --- a/medcat-den/src/medcat_den/resolver/resolver.py +++ b/medcat-den/src/medcat_den/resolver/resolver.py @@ -96,6 +96,8 @@ def _init_den_cnf( raise ValueError("Need to specify a host for remote den") if not credentials: raise ValueError("Need to specify credentials for remote den") + # NOTE: these will default to False when nothing is specified + # because "None" is not in ALLOW_OPTION_LOWERCASE allow_local_fine_tune = str( remote_allow_local_fine_tune or os.getenv(MEDCAT_DEN_REMOTE_ALLOW_LOCAL_FINE_TUNE) From 480197cc5db1e0c50635757857dfd8254229243e Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:31:08 +0100 Subject: [PATCH 14/17] CU-869an5f00: Propagate new options to get_default_den --- medcat-den/src/medcat_den/den.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/medcat-den/src/medcat_den/den.py b/medcat-den/src/medcat_den/den.py index 8a9dde87..eb91331c 100644 --- a/medcat-den/src/medcat_den/den.py +++ b/medcat-den/src/medcat_den/den.py @@ -162,6 +162,8 @@ def get_default_den( expiration_time: Optional[int] = None, max_size: Optional[int] = None, eviction_policy: Optional[str] = None, + remote_allow_local_fine_tune: Optional[str] = None, + remote_allow_push_fine_tuned: Optional[str] = None, ) -> Den: """Get the default den. @@ -187,6 +189,10 @@ def get_default_den( Policies avialable: LRU (`least-recently-used`), LRS (`least-recently-stored`), LFU (`least-frequently-used`), and `none` (disables evictions). + remote_allow_local_fine_tune (Optional[str]): Whether to allow local + fine tuning of remote models. + remote_allow_push_fine_tuned (Optional[str]): Whether to allow pushing + of locally fine-tuned models to the remote Returns: Den: The resolved den. @@ -194,7 +200,8 @@ def get_default_den( # NOTE: doing dynamic import to avoid circular imports from medcat_den.resolver import resolve return resolve(type_, location, host, credentials, local_cache_path, - expiration_time, max_size, eviction_policy) + expiration_time, max_size, eviction_policy, + remote_allow_local_fine_tune, remote_allow_push_fine_tuned) def get_default_user_local_den( From 104ec06aa596d3123eab8eb27bb781411b81b643 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:35:12 +0100 Subject: [PATCH 15/17] CU-869an5f00: Add small comment regarding pushing base models --- medcat-den/src/medcat_den/wrappers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/medcat-den/src/medcat_den/wrappers.py b/medcat-den/src/medcat_den/wrappers.py index de86c29a..a46c9318 100644 --- a/medcat-den/src/medcat_den/wrappers.py +++ b/medcat-den/src/medcat_den/wrappers.py @@ -61,6 +61,7 @@ def save_model_pack( if (is_injected_for_save() and isinstance( self._den_cnf, RemoteDenConfig) and not self._den_cnf.allow_push_fine_tuned): + # NOTE: should there be a check whether this is a base model? raise CannotSendToRemoteException( "Cannot save fine-tuned model onto a remote den." "In order to make full use of the remote den capabilities, " From 6e9a7371fa8176994813cbf66d1729d181f16020 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:50:48 +0100 Subject: [PATCH 16/17] CU-869an5f00: Add more info to failure to load a registered den type --- medcat-den/src/medcat_den/resolver/resolver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-den/src/medcat_den/resolver/resolver.py b/medcat-den/src/medcat_den/resolver/resolver.py index 325dc6c5..fa97ff82 100644 --- a/medcat-den/src/medcat_den/resolver/resolver.py +++ b/medcat-den/src/medcat_den/resolver/resolver.py @@ -158,7 +158,8 @@ def resolve_from_config(config: DenConfig) -> Den: den = den_cls(cnf=config) if not isinstance(den, Den): raise ValueError( - f"Registered den class for {config.type} is not a Den") + f"Registered den class for {config.type} is not a Den. " + f"Got {type(den)}: {den}") return den else: raise ValueError( From 95b10e6d02f9b7031f14399c8ceee0aaf50bab97 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 3 Oct 2025 16:51:22 +0100 Subject: [PATCH 17/17] CU-869an5f00: Fix mocked den for python 3.12. That is, needed to make sure it is explicitly specced for the protocol for the instanceof check to pass --- medcat-den/tests/test_backend_registration.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/medcat-den/tests/test_backend_registration.py b/medcat-den/tests/test_backend_registration.py index 9ff2ea44..e5d5de09 100644 --- a/medcat-den/tests/test_backend_registration.py +++ b/medcat-den/tests/test_backend_registration.py @@ -3,9 +3,13 @@ from medcat_den.backend import DenType, _remote_den_map, register_remote_den from medcat_den.resolver import resolve +from medcat_den.den import Den -FakeDen = MagicMock +class FakeDen(MagicMock): + + def __init__(self, *args, **kw): + super().__init__(*args, **kw, spec=Den) @pytest.fixture()