From 24a6364b72756bef3c138c96150aca69b1caae93 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 16 Oct 2025 18:03:05 +0200 Subject: [PATCH 1/9] [WIP] TST: Clean up testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is still some chaos in our test suite, despite recent efforts to refactor it. This PR tries to improve the situation a bit. Currently, some tests are expected to fail. This can be due to errors in this PR, but some errors may be genuine bugs. I'll investigate further. Some of the changes: - don't add return to pytest.skip, it's not necessary - avoid self.skipTest, always use pytest.skip - unused CONFIG_TESTING_KWARGS is removed - factor out common skipping logic into dedicated functions - many tests had code like: skip unless it's LoRA or IA³ or ... often, new PEFT methods would actually work but were not added to this list, resulting in unnecessary skips --- tests/test_decoder_models.py | 4 - tests/test_encoder_decoder_models.py | 4 - tests/test_feature_extraction_models.py | 4 - tests/test_gpu_examples.py | 2 +- tests/test_mixed.py | 2 +- tests/test_seq_classifier.py | 4 - tests/test_target_parameters.py | 4 - tests/testing_common.py | 384 ++++++------------------ 8 files changed, 89 insertions(+), 319 deletions(-) diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index ed7ff2b9e1..0a88ef932f 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -316,10 +316,6 @@ def _skip_alora_no_activation(config_cls, config_kwargs): class TestDecoderModels(PeftCommonTester): transformers_class = AutoModelForCausalLM - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index d940d0f9a1..ddf87cfe13 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -219,10 +219,6 @@ class TestEncoderDecoderModels(PeftCommonTester): transformers_class = AutoModelForSeq2SeqLM - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) decoder_input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index d7dd604c97..eb1b2936f8 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -249,10 +249,6 @@ class TestPeftFeatureExtractionModel(PeftCommonTester): transformers_class = AutoModel - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index e37d78d49e..9573c2aca2 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -4557,7 +4557,7 @@ def _test_model(self, model, precision): input_ids = torch.randint(0, 1000, (2, 10)).to(self.device) if precision == 
torch.bfloat16: if not is_bf16_available(): - self.skipTest("Bfloat16 not supported on this device") + pytest.skip("Bfloat16 not supported on this device") # Forward pass with test precision with torch.autocast(enabled=True, dtype=precision, device_type=self.device): diff --git a/tests/test_mixed.py b/tests/test_mixed.py index 7ec18387c8..d7b663c182 100644 --- a/tests/test_mixed.py +++ b/tests/test_mixed.py @@ -526,7 +526,7 @@ def test_target_first_layer_same_type(self, config0, config1): def test_deeply_nested(self): # a somewhat absurdly nested model using different adapter types if platform.system() == "Linux": - self.skipTest("This test fails but only on GitHub CI with Linux systems.") + pytest.skip("This test fails but only on GitHub CI with Linux systems.") atol = 1e-5 rtol = 1e-5 diff --git a/tests/test_seq_classifier.py b/tests/test_seq_classifier.py index eb0a3d38a4..a59f119c84 100644 --- a/tests/test_seq_classifier.py +++ b/tests/test_seq_classifier.py @@ -225,10 +225,6 @@ class TestSequenceClassificationModels(PeftCommonTester): transformers_class = AutoModelForSequenceClassification - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/test_target_parameters.py b/tests/test_target_parameters.py index adffbce0d5..64297daf3c 100644 --- a/tests/test_target_parameters.py +++ b/tests/test_target_parameters.py @@ -169,10 +169,6 @@ class TestDecoderModelsTargetParameters(PeftCommonTester): # generally, nothing is broken. transformers_class = MyAutoModelForCausalLM - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/testing_common.py b/tests/testing_common.py index 9c49119bf2..f9df1d8389 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -33,26 +33,18 @@ from peft import ( AdaLoraConfig, BOFTConfig, - BoneConfig, CPTConfig, - FourierFTConfig, - HRAConfig, IA3Config, LNTuningConfig, LoHaConfig, LoKrConfig, LoraConfig, - MissConfig, - OFTConfig, PeftModel, - PeftType, PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, - RandLoraConfig, - VBLoRAConfig, - VeraConfig, + RoadConfig, get_peft_model, get_peft_model_state_dict, inject_adapter_in_model, @@ -72,124 +64,29 @@ from .testing_utils import get_state_dict, hub_online_once -CONFIG_TESTING_KWARGS = ( - # IA³ - { - "target_modules": None, - "feedforward_modules": None, - }, - # LoRA - { - "r": 8, - "lora_alpha": 32, - "target_modules": None, - "lora_dropout": 0.05, - "bias": "none", - }, - # prefix tuning - { - "num_virtual_tokens": 10, - }, - # prompt encoder - { - "num_virtual_tokens": 10, - "encoder_hidden_size": 32, - }, - # prompt tuning - { - "num_virtual_tokens": 10, - }, - # AdaLoRA - { - "target_modules": None, - "total_step": 1, - }, - # BOFT - { - "target_modules": None, - }, - # VeRA - { - "r": 8, - "target_modules": None, - "vera_dropout": 0.05, - "projection_prng_key": 0xFF, - "d_initial": 0.1, - "save_projection": True, - "bias": "none", - }, - # FourierFT - { - "n_frequency": 10, - "target_modules": None, - }, - # HRA - { - 
"target_modules": None, - }, - # VBLoRA - {"target_modules": None, "vblora_dropout": 0.05, "vector_length": 1, "num_vectors": 2}, - # OFT - { - "target_modules": None, - }, - # Bone - { - "target_modules": None, - "r": 2, - }, - # MiSS - { - "target_modules": None, - "r": 2, - }, - # LoRA + trainable_tokens - { - "r": 8, - "lora_alpha": 32, - "target_modules": None, - "lora_dropout": 0.05, - "bias": "none", - "trainable_token_indices": [0, 1, 3], - }, - # RandLoRA - { - "r": 32, - "randlora_alpha": 64, - "target_modules": None, - "randlora_dropout": 0.05, - "projection_prng_key": 0xFF, - "save_projection": True, - "bias": "none", - }, - # CPT tuninig - { - "cpt_token_ids": [0, 1, 2, 3, 4, 5, 6, 7], # Example token IDs for testing - "cpt_mask": [1, 1, 1, 1, 1, 1, 1, 1], - "cpt_tokens_type_mask": [1, 2, 2, 2, 3, 3, 4, 4], - }, -) +def _skip_if_merging_not_supported(config_cls, config_kwargs): + if issubclass(config_cls, PromptLearningConfig): + pytest.skip("Prompt learning does not support merging, skipping this test.") + if config_kwargs.get("alora_invocation_tokens") is not None: + pytest.skip("Test not applicable for Activated LoRA") + + +def _skip_if_adding_weighted_adapters_not_supported(config): + if not isinstance(config, (IA3Config, LoraConfig)): + pytest.skip("This PEFT method does not support adding weighted adapters, skipping this test.") + + +def _skip_if_deleting_adapter_not_supported(config_cls, config_kwargs): + if issubclass(config_cls, PromptLearningConfig): + pytest.skip("Prompt learning does not support deletion of adapters, skipping this test.") -CLASSES_MAPPING = { - "ia3": (IA3Config, CONFIG_TESTING_KWARGS[0]), - "lora": (LoraConfig, CONFIG_TESTING_KWARGS[1]), - "prefix_tuning": (PrefixTuningConfig, CONFIG_TESTING_KWARGS[2]), - "prompt_encoder": (PromptEncoderConfig, CONFIG_TESTING_KWARGS[3]), - "prompt_tuning": (PromptTuningConfig, CONFIG_TESTING_KWARGS[4]), - "adalora": (AdaLoraConfig, CONFIG_TESTING_KWARGS[5]), - "boft": (BOFTConfig, CONFIG_TESTING_KWARGS[6]), - "vera": (VeraConfig, CONFIG_TESTING_KWARGS[7]), - "fourierft": (FourierFTConfig, CONFIG_TESTING_KWARGS[8]), - "hra": (HRAConfig, CONFIG_TESTING_KWARGS[9]), - "vblora": (VBLoRAConfig, CONFIG_TESTING_KWARGS[10]), - "oft": (OFTConfig, CONFIG_TESTING_KWARGS[11]), - "bone": (BoneConfig, CONFIG_TESTING_KWARGS[12]), - "miss": (MissConfig, CONFIG_TESTING_KWARGS[12]), - "lora+trainable_tokens": (LoraConfig, CONFIG_TESTING_KWARGS[13]), - "randlora": (RandLoraConfig, CONFIG_TESTING_KWARGS[14]), -} - -DECODER_MODELS_EXTRA = {"cpt": (CPTConfig, CONFIG_TESTING_KWARGS[15])} + +def _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs): + if "gpt2" not in model_id.lower(): + return + + if config_cls not in (IA3Config, LoHaConfig, LoKrConfig, LoraConfig): + pytest.skip("This PEFT method does not support Conv1D layers, skipping this test.") class PeftCommonTester: @@ -297,7 +194,7 @@ def _test_adapter_name(self, model_id, config_cls, config_kwargs): def _test_prepare_for_training(self, model_id, config_cls, config_kwargs): if config_kwargs.get("trainable_token_indices", None) is not None: # incompatible because trainable tokens is marking embeddings as trainable - self.skipTest("Trainable tokens is incompatible with this test.") + pytest.skip("Trainable tokens is incompatible with this test.") # some tests require specific tokenizers, make sure that they can be fetched as well with hub_online_once(model_id + config_kwargs.get("tokenizer_name_or_path", "")): @@ -452,7 +349,7 @@ def _test_save_pretrained(self, 
model_id, config_cls, config_kwargs, safe_serial def _test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs, safe_serialization=True): if issubclass(config_cls, AdaLoraConfig): # AdaLora does not support adding more than 1 adapter - return pytest.skip(f"Test not applicable for {config_cls}") + pytest.skip(f"Test not applicable for {config_cls}") # ensure that the weights are randomly initialized if issubclass(config_cls, LoraConfig): @@ -587,20 +484,10 @@ def _test_load_multiple_adapters(self, model_id, config_cls, config_kwargs): assert load_result2.missing_keys == [] def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): - if ( - config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig) - or config_kwargs.get("alora_invocation_tokens") is not None - ): - # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) - if config_kwargs.get("alora_invocation_tokens") is None: - return pytest.skip(f"Test not applicable for {config_cls}") - else: - return pytest.skip("Test not applicable for Activated LoRA") - if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) if (self.torch_device in ["cpu"]) and (version.parse(torch.__version__) <= version.parse("2.1")): - self.skipTest("PyTorch 2.1 not supported for Half of addmm_impl_cpu_ ") + pytest.skip("PyTorch 2.1 not supported for Half of addmm_impl_cpu_ ") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.float16) @@ -617,27 +504,8 @@ def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): _ = model.merge_and_unload() def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): - if ( - config_cls - not in ( - LoraConfig, - IA3Config, - AdaLoraConfig, - LoHaConfig, - LoKrConfig, - VeraConfig, - FourierFTConfig, - ) - or config_kwargs.get("alora_invocation_tokens") is not None - ): - # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) - return - if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - - if "gemma" in model_id.lower(): - # TODO: could be related to tied weights - self.skipTest("Merging currently fails with gemma") + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -704,21 +572,8 @@ def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): model = model.merge_and_unload(safe_merge=True) def _test_merge_layers(self, model_id, config_cls, config_kwargs): - if issubclass(config_cls, PromptLearningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") - - if issubclass(config_cls, (OFTConfig, BOFTConfig)): - return pytest.skip(f"Test not applicable for {config_cls}") - - if config_kwargs.get("alora_invocation_tokens") is not None: - return pytest.skip("Merging not applicable to aLoRA") - - if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - - if "gemma" in model_id.lower(): - # TODO: could be related to tied weights - self.skipTest("Merging currently fails with 
gemma") + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -792,23 +647,12 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_merged, logits_merged_from_pretrained, atol=atol, rtol=rtol) def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): - supported_peft_types = [ - PeftType.LORA, - PeftType.LOHA, - PeftType.LOKR, - PeftType.IA3, - PeftType.OFT, - PeftType.BOFT, - PeftType.HRA, - PeftType.BONE, - PeftType.MISS, - ] - - if ("gpt2" in model_id.lower()) and (config_cls == IA3Config): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - + _skip_if_merging_not_supported(config_cls, config_kwargs) + if issubclass(config_cls, AdaLoraConfig): + # AdaLora does not support adding more than 1 adapter + pytest.skip("AdaLoRA does not support multiple adapters, skipping this test.") if config_kwargs.get("trainable_token_indices", None) is not None: - self.skipTest( + pytest.skip( "Merging two adapters with trainable tokens is tested elsewhere since adapters with " "the same token indices cannot be merged." ) @@ -818,9 +662,6 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): **config_kwargs, ) - if config.peft_type not in supported_peft_types or config_kwargs.get("alora_invocation_tokens") is not None: - return - with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) model = get_peft_model(model, config) @@ -880,9 +721,9 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_merged_adapter_default, logits_adapter_1, atol=1e-3, rtol=1e-3) def _test_merge_layers_is_idempotent(self, model_id, config_cls, config_kwargs): - if config_kwargs.get("alora_invocation_tokens") is not None: - # Merging not supported for Activated LoRA (aLoRA) - return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) + with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) config = config_cls( @@ -905,9 +746,8 @@ def _test_merge_layers_is_idempotent(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_0, logits_1, atol=1e-6, rtol=1e-6) def _test_safe_merge(self, model_id, config_cls, config_kwargs): - if config_kwargs.get("alora_invocation_tokens") is not None: - # Merging not supported for Activated LoRA (aLoRA) - return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") + _skip_if_merging_not_supported(config_cls, config_kwargs) + torch.manual_seed(0) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -952,8 +792,8 @@ def _test_safe_merge(self, model_id, config_cls, config_kwargs): def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): # Test for mixing different adapters in a single batch by passing the adapter_names argument - if config_cls not in (LoraConfig,): - return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") + if config_cls not in (LoraConfig, RoadConfig): + pytest.skip(f"Mixed adapter batches not supported for {config_cls}") config = config_cls( base_model_name_or_path=model_id, @@ -1016,14 +856,14 @@ def _test_mixed_adapter_batches(self, model_id, config_cls, 
config_kwargs): def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, config_cls, config_kwargs): # Test generating with beam search and with mixing different adapters in a single batch by passing the # adapter_names argument. See #2283. - if config_cls not in (LoraConfig,): - return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") + if config_cls not in (LoraConfig, RoadConfig): + pytest.skip(f"Mixed adapter batches not supported for {config_cls}") if config_kwargs.get("alora_invocation_tokens") is not None: - return pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported + pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported if config_kwargs.get("trainable_token_indices", None) is not None: # for some configurations this test will fail since the adapter values don't differ. # this is probably a problem with the test setup and not with the implementation. - return pytest.skip("Trainable token indices is not supported here (yet).") + pytest.skip("Trainable token indices is not supported here (yet).") config = config_cls( base_model_name_or_path=model_id, @@ -1135,12 +975,6 @@ def _test_generate_pos_args(self, model_id, config_cls, config_kwargs, raises_er _ = model.generate(inputs["input_ids"]) def _test_generate_half_prec(self, model_id, config_cls, config_kwargs): - if config_cls not in (IA3Config, LoraConfig, PrefixTuningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") - - if self.torch_device == "mps": # BFloat16 is not supported on MPS - return pytest.skip("BFloat16 is not supported on MPS") - with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.bfloat16) config = config_cls( @@ -1158,7 +992,7 @@ def _test_generate_half_prec(self, model_id, config_cls, config_kwargs): def _test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_kwargs): if config_cls not in (PrefixTuningConfig,): - return pytest.skip(f"Test not applicable for {config_cls}") + pytest.skip(f"Test not applicable for {config_cls}") config = config_cls( base_model_name_or_path=model_id, @@ -1173,11 +1007,9 @@ def _test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_ assert model.base_model_torch_dtype == torch.float16 def _test_training(self, model_id, config_cls, config_kwargs): - if issubclass(config_cls, PromptLearningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): # TODO: no gradients on the "dense" layer, other layers work, not sure why - self.skipTest("AdaLora with RoBERTa does not work correctly") + pytest.skip("AdaLora with RoBERTa does not work correctly") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1194,19 +1026,16 @@ def _test_training(self, model_id, config_cls, config_kwargs): output = model(**inputs)[0] loss = output.sum() loss.backward() - parameter_prefix = model.prefix - for n, param in model.named_parameters(): - if (parameter_prefix in n) or ("modules_to_save" in n) or ("token_adapter.trainable_tokens" in n): - assert param.grad is not None - else: - assert param.grad is None - def _test_inference_safetensors(self, model_id, config_cls, config_kwargs): - if (config_cls == PrefixTuningConfig) and ("deberta" in model_id.lower()): - # TODO: raises an error: - # TypeError: DebertaModel.forward() got an unexpected keyword argument 
'past_key_values' - self.skipTest("DeBERTa with PrefixTuning does not work correctly") + parameter_prefix = getattr(model, "prefix", None) + if parameter_prefix is not None: # can only check PEFT methods that allow to identify PEFT params + for n, param in model.named_parameters(): + if (parameter_prefix in n) or ("modules_to_save" in n) or ("token_adapter.trainable_tokens" in n): + assert param.grad is not None + else: + assert param.grad is None + def _test_inference_safetensors(self, model_id, config_cls, config_kwargs): config = config_cls( base_model_name_or_path=model_id, **config_kwargs, @@ -1243,14 +1072,16 @@ def _test_inference_safetensors(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits, logits_from_pretrained, atol=1e-4, rtol=1e-4) def _test_training_layer_indexing(self, model_id, config_cls, config_kwargs): - if config_cls not in (LoraConfig,): - return pytest.skip(f"Test not applicable for {config_cls}") + try: + config = config_cls( + base_model_name_or_path=model_id, + layers_to_transform=[0], + **config_kwargs, + ) + except TypeError: + pytest.skip("This PEFT method does not support layers_to_transform, skipping it.") + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) - config = config_cls( - base_model_name_or_path=model_id, - layers_to_transform=[0], - **config_kwargs, - ) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) model = get_peft_model(model, config) @@ -1309,21 +1140,13 @@ def _test_training_layer_indexing(self, model_id, config_cls, config_kwargs): def _test_training_gradient_checkpointing(self, model_id, config_cls, config_kwargs): if config_cls == PrefixTuningConfig: - return pytest.skip(f"Test not applicable for {config_cls}") - - if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): - # TODO: no gradients on the "dense" layer, other layers work, not sure why - self.skipTest("AdaLora with RoBERTa does not work correctly") - - if (config_cls == OFTConfig) and ("deberta" in model_id.lower()): - # TODO: no gradients on the "dense" layer, other layers work, not sure why - self.skipTest("OFT with Deberta does not work correctly") + pytest.skip("Prefix Tuning does not support gradient checkpointing, skipping this test.") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) if not getattr(model, "supports_gradient_checkpointing", False): - return pytest.skip(f"Model {model_id} does not support gradient checkpointing") + pytest.skip(f"Model {model_id} does not support gradient checkpointing") model.gradient_checkpointing_enable() @@ -1358,9 +1181,7 @@ def _test_training_gradient_checkpointing(self, model_id, config_cls, config_kwa assert param.grad is None def _test_peft_model_device_map(self, model_id, config_cls, config_kwargs): - if config_cls not in (LoraConfig, VBLoRAConfig): - return pytest.skip(f"Test not applicable for {config_cls}") - + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) config = config_cls( base_model_name_or_path=model_id, **config_kwargs, @@ -1382,7 +1203,7 @@ def _test_peft_model_device_map(self, model_id, config_cls, config_kwargs): def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwargs): if not issubclass(config_cls, PromptLearningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") + pytest.skip(f"Test not applicable for {config_cls}") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1413,28 
+1234,14 @@ def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwar assert param.grad is not None def _test_delete_adapter(self, model_id, config_cls, config_kwargs): - supported_peft_types = [ - PeftType.LORA, - PeftType.LOHA, - PeftType.LOKR, - PeftType.IA3, - PeftType.OFT, - PeftType.BOFT, - PeftType.VERA, - PeftType.FOURIERFT, - PeftType.HRA, - PeftType.VBLORA, - PeftType.BONE, - PeftType.MISS, - ] - # IA3 does not support deleting adapters yet, but it just needs to be added - # AdaLora does not support multiple adapters + if config_cls == AdaLoraConfig: + pytest.skip("AdaLoRA does not support multiple adapters") + _skip_if_deleting_adapter_not_supported(config_cls, config_kwargs) + config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) - if config.peft_type not in supported_peft_types: - return pytest.skip(f"Test not applicable for {config.peft_type}") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1487,28 +1294,14 @@ def _test_delete_adapter(self, model_id, config_cls, config_kwargs): model.base_model(**input) # should not raise an error def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): - # same as test_delete_adapter, but this time an inactive adapter is deleted - supported_peft_types = [ - PeftType.LORA, - PeftType.LOHA, - PeftType.LOKR, - PeftType.IA3, - PeftType.OFT, - PeftType.BOFT, - PeftType.FOURIERFT, - PeftType.HRA, - PeftType.VBLORA, - PeftType.BONE, - PeftType.MISS, - ] - # IA3 does not support deleting adapters yet, but it just needs to be added - # AdaLora does not support multiple adapters + if config_cls == AdaLoraConfig: + pytest.skip("AdaLoRA does not support multiple adapters") + _skip_if_deleting_adapter_not_supported(config_cls, config_kwargs) + config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) - if config.peft_type not in supported_peft_types: - return pytest.skip(f"Test not applicable for {config.peft_type}") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1609,6 +1402,7 @@ def _test_unload_adapter(self, model_id, config_cls, config_kwargs): assert num_params_base == num_params_unloaded def _test_weighted_combination_of_adapters_lora(self, model, config, adapter_list, weight_list): + _skip_if_adding_weighted_adapters_not_supported(config) model.add_adapter(adapter_list[1], config) model.add_adapter(adapter_list[2], replace(config, r=20)) model = model.to(self.torch_device) @@ -1828,8 +1622,6 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw if model_id.endswith("qwen2"): # Qwen2 fails with weighted adapter combinations using SVD return pytest.skip(f"Test does not work with model {model_id}") - if "gemma" in model_id.lower(): - return pytest.skip("Combining Gemma adapters with SVD is currently failing") adapter_list = ["adapter1", "adapter_2", "adapter_3"] weight_list = [0.5, 1.5, 1.5] @@ -1860,7 +1652,7 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw def _test_disable_adapter(self, model_id, config_cls, config_kwargs): task_type = config_kwargs.get("task_type") if (task_type == "SEQ_2_SEQ_LM") and (config_cls in (PromptTuningConfig, PromptEncoderConfig)): - self.skipTest("Seq2Seq + prompt tuning/prompt encoder does not work with disabling adapters") + pytest.skip("Seq2Seq + prompt tuning/prompt encoder does not work with disabling adapters") def get_output(model): # helper function that works with different 
model types @@ -1939,19 +1731,17 @@ def get_output(model): # TODO: add tests to check if disabling adapters works after calling merge_adapter def _test_adding_multiple_adapters_with_bias_raises(self, model_id, config_cls, config_kwargs): - # When trying to add multiple adapters with bias in Lora, AdaLora or BOFTConfig, an error should be - # raised. Also, the peft model should not be left in a half-initialized state. - if not issubclass(config_cls, (LoraConfig, AdaLoraConfig, BOFTConfig)): - return pytest.skip(f"Test not applicable for {config_cls}") - - with hub_online_once(model_id): - config_kwargs = config_kwargs.copy() - config_kwargs["bias"] = "all" + config_kwargs = config_kwargs.copy() + config_kwargs["bias"] = "all" + try: config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) + except TypeError: + pytest.skip(f"{config_cls} does not support the 'bias' argument, skipping this test.") + with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) model = get_peft_model(model, config, "adapter0") From 918a239be901c8c5eb61569ae9a93e59192eb21b Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 17 Oct 2025 17:19:38 +0200 Subject: [PATCH 2/9] Some further fixes --- src/peft/tuners/boft/layer.py | 4 +-- src/peft/tuners/c3a/model.py | 4 +++ src/peft/tuners/ln_tuning/layer.py | 2 +- src/peft/tuners/waveft/layer.py | 5 ++-- tests/testing_common.py | 41 ++++++++++++++++++++---------- 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/peft/tuners/boft/layer.py b/src/peft/tuners/boft/layer.py index 7232f39d17..d2ebb5ff78 100644 --- a/src/peft/tuners/boft/layer.py +++ b/src/peft/tuners/boft/layer.py @@ -457,10 +457,10 @@ def cayley_batch(self, data): skew_mat = 0.5 * (data - data.transpose(1, 2)) id_mat = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) - # Perform the Cayley parametrization + # Perform the Cayley parametrization, must be in float32 Q = torch.linalg.solve(id_mat + skew_mat, id_mat - skew_mat, left=False) - return Q + return Q.to(data.dtype) class Linear(nn.Module, BOFTLayer): diff --git a/src/peft/tuners/c3a/model.py b/src/peft/tuners/c3a/model.py index 6e71973691..19c9ef763e 100644 --- a/src/peft/tuners/c3a/model.py +++ b/src/peft/tuners/c3a/model.py @@ -93,5 +93,9 @@ def _create_new_module(c3a_config, adapter_name, target, **kwargs): if isinstance(target_base_layer, torch.nn.Linear): new_module = C3ALinear(target, adapter_name, **kwargs) + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only `torch.nn.Linear` is supported." 
+ ) return new_module diff --git a/src/peft/tuners/ln_tuning/layer.py b/src/peft/tuners/ln_tuning/layer.py index e29149f2cb..4000e992a7 100644 --- a/src/peft/tuners/ln_tuning/layer.py +++ b/src/peft/tuners/ln_tuning/layer.py @@ -105,7 +105,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: if self.merged: self.unmerge() result = self.base_layer(x, *args, **kwargs) - elif self.merged: + elif self.merged or (len(self.active_adapters) == 0): result = self.base_layer(x, *args, **kwargs) else: if len(self.active_adapters) != 1: diff --git a/src/peft/tuners/waveft/layer.py b/src/peft/tuners/waveft/layer.py index a17f3ffba3..2dbbcae327 100644 --- a/src/peft/tuners/waveft/layer.py +++ b/src/peft/tuners/waveft/layer.py @@ -21,6 +21,7 @@ from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.other import transpose from .constants import WAVELET_REDUCTIONS from .waverec2d import waverec2d @@ -237,7 +238,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) + orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -246,7 +247,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: diff --git a/tests/testing_common.py b/tests/testing_common.py index f9df1d8389..8f71a8856d 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -45,6 +45,7 @@ PromptLearningConfig, PromptTuningConfig, RoadConfig, + VBLoRAConfig, get_peft_model, get_peft_model_state_dict, inject_adapter_in_model, @@ -540,14 +541,7 @@ def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): model = model.to(self.torch_device) for name, module in model.named_parameters(): - if ( - "lora_A" in name - or "ia3" in name - or "lora_E" in name - or "lora_B" in name - or "vera_lambda" in name - or "fourierft_spectrum" in name - ): + if model.prefix in name: module.data[0] = torch.nan with pytest.raises( @@ -606,12 +600,14 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): atol, rtol = 1e-4, 1e-4 if self.torch_device in ["mlu"]: atol, rtol = 1e-3, 1e-3 # MLU - if config.peft_type == "ADALORA": - # AdaLoRA is a bit flaky on CI, but this cannot be reproduced locally + if config.peft_type in ("ADALORA", "OFT"): + # these methods require a bit higher tolerance atol, rtol = 1e-2, 1e-2 - if (config.peft_type in {"IA3", "LORA"}) and (model_id in conv_ids): + if (config.peft_type in {"IA3", "LORA", "OFT"}) and (model_id in conv_ids): # for some reason, the Conv introduces a larger error atol, rtol = 0.3, 0.01 + if (config.peft_type == "OFT") and not model.config.is_decoder: + atol, rtol = 0.3, 0.01 if model_id == "trl-internal-testing/tiny-Llama4ForCausalLM": # also getting larger errors here, not exactly sure why atol, rtol = 0.3, 0.01 @@ -856,7 +852,8 @@ def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): def 
_test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, config_cls, config_kwargs): # Test generating with beam search and with mixing different adapters in a single batch by passing the # adapter_names argument. See #2283. - if config_cls not in (LoraConfig, RoadConfig): + if config_cls not in (LoraConfig,): + # note: RoAD supports mixed adapter batches but not beam search pytest.skip(f"Mixed adapter batches not supported for {config_cls}") if config_kwargs.get("alora_invocation_tokens") is not None: pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported @@ -975,6 +972,8 @@ def _test_generate_pos_args(self, model_id, config_cls, config_kwargs, raises_er _ = model.generate(inputs["input_ids"]) def _test_generate_half_prec(self, model_id, config_cls, config_kwargs): + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) + with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.bfloat16) config = config_cls( @@ -1007,6 +1006,8 @@ def _test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_ assert model.base_model_torch_dtype == torch.float16 def _test_training(self, model_id, config_cls, config_kwargs): + if issubclass(config_cls, PromptLearningConfig): + pytest.skip("Test not applicable for prompt learning, skipping this test.") if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): # TODO: no gradients on the "dense" layer, other layers work, not sure why pytest.skip("AdaLora with RoBERTa does not work correctly") @@ -1036,6 +1037,11 @@ def _test_training(self, model_id, config_cls, config_kwargs): assert param.grad is None def _test_inference_safetensors(self, model_id, config_cls, config_kwargs): + if (config_cls == PrefixTuningConfig) and ("deberta" in model_id.lower()): + # TODO: raises an error: + # TypeError: DebertaModel.forward() got an unexpected keyword argument 'past_key_values' + pytest.skip("DeBERTa with PrefixTuning does not work correctly") + config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) @@ -1115,7 +1121,11 @@ def _test_training_layer_indexing(self, model_id, config_cls, config_kwargs): ) logits_from_pretrained = model_from_pretrained(**inputs)[0][0] - assert torch.allclose(logits, logits_from_pretrained, atol=1e-4, rtol=1e-4) + if config_cls == VBLoRAConfig: + atol, rtol = 1e-3, 1e-3 + else: + atol, rtol = 1e-4, 1e-4 + assert torch.allclose(logits, logits_from_pretrained, atol=atol, rtol=rtol) # check the nb of trainable params again but without layers_to_transform model = self.transformers_class.from_pretrained(model_id) @@ -1141,6 +1151,9 @@ def _test_training_gradient_checkpointing(self, model_id, config_cls, config_kwargs): if config_cls == PrefixTuningConfig: pytest.skip("Prefix Tuning does not support gradient checkpointing, skipping this test.") + if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): + # TODO: no gradients on the "dense" layer, other layers work, not sure why + pytest.skip("AdaLora with RoBERTa does not work correctly") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1622,6 +1635,8 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw if model_id.endswith("qwen2"): # Qwen2 fails with weighted adapter combinations using SVD return pytest.skip(f"Test does not work with model {model_id}") + if "gemma" in 
model_id.lower(): + return pytest.skip("Combining Gemma adapters with SVD is currently failing") adapter_list = ["adapter1", "adapter_2", "adapter_3"] weight_list = [0.5, 1.5, 1.5] From 6f7d19921a68c58828dcbf0ec0bde3a958462d31 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 20 Oct 2025 12:36:52 +0200 Subject: [PATCH 3/9] More fixes * more BOFT dtype fixes * fix faulty test --- src/peft/tuners/boft/layer.py | 7 +++++++ tests/testing_common.py | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/peft/tuners/boft/layer.py b/src/peft/tuners/boft/layer.py index d2ebb5ff78..e77cc6f5c7 100644 --- a/src/peft/tuners/boft/layer.py +++ b/src/peft/tuners/boft/layer.py @@ -370,6 +370,13 @@ def update_layer( self._move_adapter_to_device_of_base_layer(adapter_name) self.set_adapter(self.active_adapters, inference_mode=inference_mode) + def _move_adapter_to_device_of_base_layer(self, adapter_name: str, **kwargs) -> None: + super()._move_adapter_to_device_of_base_layer(adapter_name=adapter_name, **kwargs) + new_device = self.boft_R[adapter_name].device + new_dtype = self.boft_R[adapter_name].dtype + if new_device != torch.device("meta"): + self.boft_P = self.boft_P.to(new_device, new_dtype) + def reset_boft_parameters(self, adapter_name, init_weights): """ Reset the BOFT parameters. diff --git a/tests/testing_common.py b/tests/testing_common.py index 5f05518192..5892bc70f5 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -597,6 +597,7 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): logits_merged_unloaded = model(**dummy_input)[0] conv_ids = ["Conv2d", "Conv3d", "Conv2d2"] + is_decoder = getattr(getattr(model, "config", None), "is_decoder", False) atol, rtol = 1e-4, 1e-4 if self.torch_device in ["mlu"]: atol, rtol = 1e-3, 1e-3 # MLU @@ -606,7 +607,7 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): if (config.peft_type in {"IA3", "LORA", "OFT"}) and (model_id in conv_ids): # for some reason, the Conv introduces a larger error atol, rtol = 0.3, 0.01 - if (config.peft_type == "OFT") and not model.config.is_decoder: + if (config.peft_type == "OFT") and not is_decoder: atol, rtol = 0.3, 0.01 if model_id == "trl-internal-testing/tiny-Llama4ForCausalLM": # also getting larger errors here, not exactly sure why From 904757539e6639cac7566b2c98c61ae647263ac8 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 20 Oct 2025 13:06:09 +0200 Subject: [PATCH 4/9] FourierFT merging deal with Conv1D --- src/peft/tuners/fourierft/layer.py | 9 ++++++--- src/peft/tuners/waveft/layer.py | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/peft/tuners/fourierft/layer.py b/src/peft/tuners/fourierft/layer.py index a03a57f118..622420d085 100644 --- a/src/peft/tuners/fourierft/layer.py +++ b/src/peft/tuners/fourierft/layer.py @@ -21,6 +21,7 @@ from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.other import transpose class FourierFTLayer(BaseTunerLayer): @@ -139,7 +140,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. 
orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) + orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -148,7 +149,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -161,7 +162,9 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.fourierft_spectrum.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= transpose( + self.get_delta_weight(active_adapter), self.fan_in_fan_out + ) def get_delta_weight(self, adapter) -> torch.Tensor: return super().get_delta_weight(adapter) diff --git a/src/peft/tuners/waveft/layer.py b/src/peft/tuners/waveft/layer.py index 2dbbcae327..e856736428 100644 --- a/src/peft/tuners/waveft/layer.py +++ b/src/peft/tuners/waveft/layer.py @@ -260,7 +260,9 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.waveft_spectrum.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= transpose( + self.get_delta_weight(active_adapter), self.fan_in_fan_out + ) def get_delta_weight(self, adapter) -> torch.Tensor: return super().get_delta_weight(adapter) From e4b9aa725e8ae89901fcef3a7711c7a11a71bcfc Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 20 Oct 2025 13:15:04 +0200 Subject: [PATCH 5/9] Fix VBLoRA test --- tests/testing_common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/testing_common.py b/tests/testing_common.py index 5892bc70f5..75594f15f6 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -658,9 +658,14 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): base_model_name_or_path=model_id, **config_kwargs, ) + if config_cls == VBLoRAConfig: + # for VBLoRA, increase this value or else the two adapters are too similar + config.init_logits_std *= 100 + config.init_vector_bank_bound *= 100 with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) + torch.manual_seed(0) model = get_peft_model(model, config) model = model.to(self.torch_device) @@ -670,6 +675,7 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): with torch.inference_mode(): logits_adapter_1 = model(**dummy_input)[0] + torch.manual_seed(1) model.add_adapter("adapter-2", config) model.set_adapter("adapter-2") model.eval() From e1a44314113dda966bfa0b32d7b98de8adf5ea30 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 20 Oct 2025 13:25:41 +0200 Subject: [PATCH 6/9] Move transpose into get_delta_weight --- src/peft/tuners/fourierft/layer.py | 12 ++++-------- src/peft/tuners/waveft/layer.py | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/peft/tuners/fourierft/layer.py b/src/peft/tuners/fourierft/layer.py index 622420d085..9f4458135c 100644 --- a/src/peft/tuners/fourierft/layer.py +++ b/src/peft/tuners/fourierft/layer.py @@ -93,7 +93,7 @@ def get_delta_weight(self, adapter) -> 
torch.Tensor: dense_spectrum = torch.zeros(self.out_features, self.in_features, device=spectrum.device) dense_spectrum[indices[0, :], indices[1, :]] = spectrum.float() delta_weight = torch.fft.ifft2(dense_spectrum).real * self.fourierft_scaling[adapter] - return delta_weight.to(spectrum.dtype) + return transpose(delta_weight.to(spectrum.dtype), self.fan_in_fan_out) class FourierFTLinear(nn.Module, FourierFTLayer): @@ -140,7 +140,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. orig_weights = base_layer.weight.data.clone() - orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) + orig_weights += self.get_delta_weight(active_adapter) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -149,7 +149,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) + base_layer.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -162,13 +162,9 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.fourierft_spectrum.keys(): - self.get_base_layer().weight.data -= transpose( - self.get_delta_weight(active_adapter), self.fan_in_fan_out + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) ) - def get_delta_weight(self, adapter) -> torch.Tensor: - return super().get_delta_weight(adapter) - def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype diff --git a/src/peft/tuners/waveft/layer.py b/src/peft/tuners/waveft/layer.py index e856736428..8a74e61306 100644 --- a/src/peft/tuners/waveft/layer.py +++ b/src/peft/tuners/waveft/layer.py @@ -189,6 +189,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor: dense_spectrum[indices[0, :], indices[1, :]] = spectrum delta_weight = dense_spectrum * self.waveft_scaling[adapter] + delta_weight = transpose(delta_weight, self.fan_in_fan_out) return delta_weight @@ -238,7 +239,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. 
orig_weights = base_layer.weight.data.clone() - orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) + orig_weights += self.get_delta_weight(active_adapter), self.fan_in_fan_out if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -247,7 +248,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) + base_layer.weight.data += self.get_delta_weight(active_adapter), self.fan_in_fan_out self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -260,12 +261,7 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.waveft_spectrum.keys(): - self.get_base_layer().weight.data -= transpose( - self.get_delta_weight(active_adapter), self.fan_in_fan_out - ) - - def get_delta_weight(self, adapter) -> torch.Tensor: - return super().get_delta_weight(adapter) + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype From f0de8521474dfdb90dd7edad3f0c194bbb1ca138 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 20 Oct 2025 14:06:44 +0200 Subject: [PATCH 7/9] fix error --- src/peft/tuners/fourierft/layer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/peft/tuners/fourierft/layer.py b/src/peft/tuners/fourierft/layer.py index 9f4458135c..7f65ad0737 100644 --- a/src/peft/tuners/fourierft/layer.py +++ b/src/peft/tuners/fourierft/layer.py @@ -163,7 +163,6 @@ def unmerge(self) -> None: active_adapter = self.merged_adapters.pop() if active_adapter in self.fourierft_spectrum.keys(): self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) - ) def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype From e5ee3bd810d3d6bbc938156fd895aae422ba3f9b Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 20 Oct 2025 15:22:33 +0200 Subject: [PATCH 8/9] Faulty code fixes --- src/peft/tuners/waveft/layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/waveft/layer.py b/src/peft/tuners/waveft/layer.py index 8a74e61306..e76291c132 100644 --- a/src/peft/tuners/waveft/layer.py +++ b/src/peft/tuners/waveft/layer.py @@ -239,7 +239,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. 
orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter), self.fan_in_fan_out + orig_weights += self.get_delta_weight(active_adapter) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -248,7 +248,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter), self.fan_in_fan_out + base_layer.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: From 533da3f67ce82c9702570c1bf4b1b4f7cfe0ec18 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 21 Oct 2025 15:34:48 +0200 Subject: [PATCH 9/9] Fixes of fixes of BOFT, WaveFT, FourierFT --- src/peft/tuners/boft/layer.py | 11 ++--------- src/peft/tuners/fourierft/layer.py | 10 ++++++---- src/peft/tuners/waveft/layer.py | 9 +++++---- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/peft/tuners/boft/layer.py b/src/peft/tuners/boft/layer.py index e77cc6f5c7..470ce12312 100644 --- a/src/peft/tuners/boft/layer.py +++ b/src/peft/tuners/boft/layer.py @@ -370,13 +370,6 @@ def update_layer( self._move_adapter_to_device_of_base_layer(adapter_name) self.set_adapter(self.active_adapters, inference_mode=inference_mode) - def _move_adapter_to_device_of_base_layer(self, adapter_name: str, **kwargs) -> None: - super()._move_adapter_to_device_of_base_layer(adapter_name=adapter_name, **kwargs) - new_device = self.boft_R[adapter_name].device - new_dtype = self.boft_R[adapter_name].dtype - if new_device != torch.device("meta"): - self.boft_P = self.boft_P.to(new_device, new_dtype) - def reset_boft_parameters(self, adapter_name, init_weights): """ Reset the BOFT parameters. 
@@ -593,7 +586,7 @@ def get_delta_weight(self, adapter) -> tuple[torch.Tensor, torch.Tensor]: block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) - boft_P = self.boft_P.to(block_diagonal_butterfly.device) + boft_P = self.boft_P.to(block_diagonal_butterfly.device, block_diagonal_butterfly.dtype) butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) butterfly_oft_mat = butterfly_oft_mat_batch[0] @@ -926,7 +919,7 @@ def get_delta_weight(self, adapter) -> tuple[torch.Tensor, torch.Tensor]: block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) - boft_P = self.boft_P.to(block_diagonal_butterfly.device) + boft_P = self.boft_P.to(block_diagonal_butterfly.device, block_diagonal_butterfly.dtype) butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) butterfly_oft_mat = butterfly_oft_mat_batch[0] diff --git a/src/peft/tuners/fourierft/layer.py b/src/peft/tuners/fourierft/layer.py index 7f65ad0737..f95a414db9 100644 --- a/src/peft/tuners/fourierft/layer.py +++ b/src/peft/tuners/fourierft/layer.py @@ -93,7 +93,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor: dense_spectrum = torch.zeros(self.out_features, self.in_features, device=spectrum.device) dense_spectrum[indices[0, :], indices[1, :]] = spectrum.float() delta_weight = torch.fft.ifft2(dense_spectrum).real * self.fourierft_scaling[adapter] - return transpose(delta_weight.to(spectrum.dtype), self.fan_in_fan_out) + return delta_weight.to(spectrum.dtype) class FourierFTLinear(nn.Module, FourierFTLayer): @@ -140,7 +140,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. 
orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) + orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -149,7 +149,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -162,7 +162,9 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.fourierft_spectrum.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= transpose( + self.get_delta_weight(active_adapter), self.fan_in_fan_out + ) def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype diff --git a/src/peft/tuners/waveft/layer.py b/src/peft/tuners/waveft/layer.py index e76291c132..c5030e4a16 100644 --- a/src/peft/tuners/waveft/layer.py +++ b/src/peft/tuners/waveft/layer.py @@ -189,7 +189,6 @@ def get_delta_weight(self, adapter) -> torch.Tensor: dense_spectrum[indices[0, :], indices[1, :]] = spectrum delta_weight = dense_spectrum * self.waveft_scaling[adapter] - delta_weight = transpose(delta_weight, self.fan_in_fan_out) return delta_weight @@ -239,7 +238,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) + orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -248,7 +247,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -261,7 +260,9 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.waveft_spectrum.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= transpose( + self.get_delta_weight(active_adapter), self.fan_in_fan_out + ) def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype