From 207229ad5e1854cd897306ce5ee725b0245b1064 Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Thu, 26 Oct 2023 06:51:49 -0700 Subject: [PATCH 01/65] FIX Conv1D merge error for IA3 (#1014) --- src/peft/tuners/ia3/layer.py | 3 ++ src/peft/tuners/ia3/model.py | 6 +++- tests/test_custom_models.py | 57 ++++++++++++++++++++---------------- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/src/peft/tuners/ia3/layer.py b/src/peft/tuners/ia3/layer.py index c35f3d875c..18a10c88a3 100644 --- a/src/peft/tuners/ia3/layer.py +++ b/src/peft/tuners/ia3/layer.py @@ -74,6 +74,7 @@ def __init__( out_features: int, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + is_target_conv_1d_layer: bool = False, # whether target module is a conv1d layer. useful while unloading later **kwargs, ) -> None: init_ia3_weights = kwargs.pop("init_ia3_weights", True) @@ -88,6 +89,8 @@ def __init__( if fan_in_fan_out: self.weight.data = self.weight.data.T + self.is_target_conv_1d_layer = is_target_conv_1d_layer + nn.Linear.reset_parameters(self) self.update_layer(adapter_name, init_ia3_weights) self.set_adapter(adapter_name) diff --git a/src/peft/tuners/ia3/model.py b/src/peft/tuners/ia3/model.py index f4a80e8cbc..023286d42f 100644 --- a/src/peft/tuners/ia3/model.py +++ b/src/peft/tuners/ia3/model.py @@ -150,6 +150,7 @@ def _create_new_module(ia3_config, adapter_name, target, **kwargs): in_features, out_features = ( target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape ) + kwargs["is_target_conv_1d_layer"] = True # useful for unloading later if not kwargs["fan_in_fan_out"]: warnings.warn( "fan_in_fan_out is set to False but the target module is `Conv1D`. 
" @@ -330,7 +331,10 @@ def merge_and_unload(self, safe_merge: bool = False): ) else: bias = target.bias is not None - new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) + if getattr(target, "is_target_conv_1d_layer", False): + new_module = Conv1D(target.out_features, target.in_features) + else: + new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) target.merge(safe_merge=safe_merge) self._replace_module(parent, target_name, new_module, target) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 9cb867b644..d3802d08bd 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -85,27 +85,26 @@ IA3Config, {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "feedforward_modules": ["lin0"]}, ), - # TODO: There are errors when trying to merge Conv1D, hence skipping them for now - # ( - # "transformers Conv1D 1 IA3", - # "EmbConv1D", - # IA3Config, - # {"target_modules": ["conv1d"], "feedforward_modules": ["conv1d"]}, - # ), - # ( - # "transformers Conv1D 2 IA3", - # "EmbConv1D", - # IA3Config, - # {"target_modules": ["conv1d", "lin0"], "feedforward_modules": ["conv1d", "lin0"]}, - # ), - # ( - # "transformers Conv1D 1 IA3", - # "EmbConv1D", - # IA3Config, - # {"target_modules": ["conv1d"], "feedforward_modules": ["conv1d"], "modules_to_save": ["lin1"]}, - # ), - # ("Conv2d 1 IA3", "Conv2d", IA3Config, {"target_modules": ["conv2d"], "feedforward_modules": []}), - # ("Conv2d 2 IA3", "Conv2d", IA3Config, {"target_modules": ["conv2d"], "feedforward_modules": ["conv2d"]}), + ( + "transformers Conv1D 1 IA3", + "EmbConv1D", + IA3Config, + {"target_modules": ["conv1d"], "feedforward_modules": ["conv1d"]}, + ), + ( + "transformers Conv1D 2 IA3", + "EmbConv1D", + IA3Config, + {"target_modules": ["conv1d", "lin0"], "feedforward_modules": ["conv1d", "lin0"]}, + ), + ( + "transformers Conv1D 1 IA3", + "EmbConv1D", + IA3Config, + {"target_modules": ["conv1d"], "feedforward_modules": ["conv1d"], "modules_to_save": ["lin1"]}, + ), + ("Conv2d 1 IA3", "Conv2d", IA3Config, {"target_modules": ["conv2d"], "feedforward_modules": []}), + ("Conv2d 2 IA3", "Conv2d", IA3Config, {"target_modules": ["conv2d"], "feedforward_modules": ["conv2d"]}), ( "Conv2d 3 IA3", "Conv2d", @@ -547,9 +546,9 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co outputs_before = model(**X) model.train() - # EmbConv1D is slow to learn for some reason - lr = 0.01 if model_id != "EmbConv1D" else 1.0 - optimizer = torch.optim.SGD(model.parameters(), lr=lr) + lr = 0.01 + # Adam optimizer since SGD isn't great for small models with IA3 + Conv1D + optimizer = torch.optim.Adam(model.parameters(), lr=lr) # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry # breaking of some LoRA layers that are initialized with constants) @@ -571,12 +570,18 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co # check that after leaving the disable_adapter context, everything is enabled again outputs_enabled_after_disable = model(**X) - atol, rtol = 1e-5, 1e-5 # merging introduces some numerical instability - if issubclass(config_cls, IA3Config): # IAΒ³ introduces more instability + atol, rtol = 1e-5, 1e-5 # tolerances higher than defaults since merging introduces some numerical instability + + if issubclass(config_cls, IA3Config) and model_id == "Conv2d": # more instability with Conv2d + IA3 atol, rtol = 1e-3, 1e-3 + # check that there is a 
difference in results after training self.assertFalse(torch.allclose(outputs_before, outputs_after, atol=atol, rtol=rtol)) + + # check that disabling adapters gives the same results as before training self.assertTrue(torch.allclose(outputs_before, outputs_disabled, atol=atol, rtol=rtol)) + + # check that enabling + disabling adapters does not change the results self.assertTrue(torch.allclose(outputs_after, outputs_enabled_after_disable, atol=atol, rtol=rtol)) @parameterized.expand(TEST_CASES) From 884b1ac3a8ef49c9301b5bbf02e8bc64349e95f9 Mon Sep 17 00:00:00 2001 From: Alexander Kovalchuk Date: Mon, 30 Oct 2023 15:36:41 +0100 Subject: [PATCH 02/65] Add implementation of LyCORIS LoKr for SD&SDXL models (#978) KronA-like adapter --- README.md | 9 +- .../convert_sd_adapter_to_peft.py | 185 +++++++- ...dreambooth_loha.py => train_dreambooth.py} | 280 +++++++++--- src/peft/__init__.py | 2 + src/peft/mapping.py | 4 + src/peft/peft_model.py | 2 + src/peft/tuners/__init__.py | 1 + src/peft/tuners/ia3/layer.py | 4 - src/peft/tuners/loha/config.py | 22 +- src/peft/tuners/loha/layer.py | 126 +----- src/peft/tuners/loha/model.py | 199 +-------- src/peft/tuners/lokr/__init__.py | 21 + src/peft/tuners/lokr/config.py | 112 +++++ src/peft/tuners/lokr/layer.py | 373 ++++++++++++++++ src/peft/tuners/lokr/model.py | 85 ++++ src/peft/tuners/lora/layer.py | 4 - src/peft/tuners/lycoris_utils.py | 407 ++++++++++++++++++ src/peft/tuners/tuners_utils.py | 9 +- src/peft/utils/peft_types.py | 1 + src/peft/utils/save_and_load.py | 6 +- tests/test_custom_models.py | 49 ++- tests/test_stablediffusion.py | 4 +- 22 files changed, 1505 insertions(+), 400 deletions(-) rename examples/stable_diffusion/{train_dreambooth_loha.py => train_dreambooth.py} (84%) create mode 100644 src/peft/tuners/lokr/__init__.py create mode 100644 src/peft/tuners/lokr/config.py create mode 100644 src/peft/tuners/lokr/layer.py create mode 100644 src/peft/tuners/lokr/model.py create mode 100644 src/peft/tuners/lycoris_utils.py diff --git a/README.md b/README.md index 1135f124df..d4dfee5c38 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Supported methods: 6. $(IA)^3$: [Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning](https://arxiv.org/abs/2205.05638) 7. MultiTask Prompt Tuning: [Multitask Prompt Tuning Enables Parameter-Efficient Transfer Learning](https://arxiv.org/abs/2303.02861) 8. LoHa: [FedPara: Low-Rank Hadamard Product for Communication-Efficient Federated Learning](https://arxiv.org/abs/2108.06098) +9. 
LoKr: [KronA: Parameter Efficient Tuning with Kronecker Adapter](https://arxiv.org/abs/2212.10650) based on [Navigating Text-To-Image Customization:From LyCORIS Fine-Tuning to Model Evaluation](https://arxiv.org/abs/2309.14859) implementation ## Getting started @@ -134,7 +135,7 @@ Try out the πŸ€— Gradio Space which should run seamlessly on a T4 instance: **NEW** ✨ Multi Adapter support and combining multiple LoRA adapters in a weighted combination ![peft lora dreambooth weighted adapter](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/weighted_adapter_dreambooth_lora.png) -**NEW** ✨ Dreambooth training for Stable Diffusion using LoHa adapter [`examples/stable_diffusion/train_dreambooth_loha.py`](examples/stable_diffusion/train_dreambooth_loha.py) +**NEW** ✨ Dreambooth training for Stable Diffusion using LoHa and LoKr adapters [`examples/stable_diffusion/train_dreambooth.py`](examples/stable_diffusion/train_dreambooth.py) ### Parameter Efficient Tuning of LLMs for RLHF components such as Ranker and Policy - Here is an example in [trl](https://github.com/lvwerra/trl) library using PEFT+INT8 for tuning policy model: [gpt2-sentiment_peft.py](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt2-sentiment_peft.py) and corresponding [Blog](https://huggingface.co/blog/trl-peft) @@ -273,9 +274,9 @@ An example is provided in `~examples/causal_language_modeling/peft_lora_clm_acce ### Text-to-Image Generation -| Model | LoRA | LoHa | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | -| --------- | ---- | ---- | ---- | ---- | ---- | ---- | -| Stable Diffusion | βœ… | βœ… | | | | +| Model | LoRA | LoHa | LoKr | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | +| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Stable Diffusion | βœ… | βœ… | βœ… | | | | ### Image Classification diff --git a/examples/stable_diffusion/convert_sd_adapter_to_peft.py b/examples/stable_diffusion/convert_sd_adapter_to_peft.py index 0d1fa05e5c..3150d9e748 100644 --- a/examples/stable_diffusion/convert_sd_adapter_to_peft.py +++ b/examples/stable_diffusion/convert_sd_adapter_to_peft.py @@ -1,16 +1,20 @@ import argparse +import json import logging import os from collections import Counter from dataclasses import dataclass +from operator import attrgetter from typing import Dict, List, Optional, Union import safetensors import torch +import torch.nn as nn from diffusers import UNet2DConditionModel from transformers import CLIPTextModel -from peft import LoHaConfig, LoraConfig, PeftType, get_peft_model, set_peft_model_state_dict +from peft import LoHaConfig, LoKrConfig, LoraConfig, PeftType, get_peft_model, set_peft_model_state_dict +from peft.tuners.lokr.layer import factorization # Default kohya_ss LoRA replacement modules @@ -74,7 +78,48 @@ def peft_state_dict(self) -> Dict[str, torch.Tensor]: return state_dict -def construct_peft_loraconfig(info: Dict[str, LoRAInfo]) -> LoraConfig: +@dataclass +class LoKrInfo: + kohya_key: str + peft_key: str + alpha: Optional[float] = None + rank: Optional[int] = None + lokr_w1: Optional[torch.Tensor] = None + lokr_w1_a: Optional[torch.Tensor] = None + lokr_w1_b: Optional[torch.Tensor] = None + lokr_w2: Optional[torch.Tensor] = None + lokr_w2_a: Optional[torch.Tensor] = None + lokr_w2_b: Optional[torch.Tensor] = None + lokr_t2: Optional[torch.Tensor] = None + + def peft_state_dict(self) -> Dict[str, torch.Tensor]: + if (self.lokr_w1 is None) and ((self.lokr_w1_a is None) or (self.lokr_w1_b is None)): + raise ValueError("Either 
lokr_w1 or both lokr_w1_a and lokr_w1_b should be provided") + + if (self.lokr_w2 is None) and ((self.lokr_w2_a is None) or (self.lokr_w2_b is None)): + raise ValueError("Either lokr_w2 or both lokr_w2_a and lokr_w2_b should be provided") + + state_dict = {} + + if self.lokr_w1 is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w1"] = self.lokr_w1 + elif self.lokr_w1_a is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w1_a"] = self.lokr_w1_a + state_dict[f"base_model.model.{self.peft_key}.lokr_w1_b"] = self.lokr_w1_b + + if self.lokr_w2 is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w2"] = self.lokr_w2 + elif self.lokr_w2_a is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w2_a"] = self.lokr_w2_a + state_dict[f"base_model.model.{self.peft_key}.lokr_w2_b"] = self.lokr_w2_b + + if self.lokr_t2 is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_t2"] = self.lokr_t2 + + return state_dict + + +def construct_peft_loraconfig(info: Dict[str, LoRAInfo], **kwargs) -> LoraConfig: """Constructs LoraConfig from data extracted from adapter checkpoint Args: @@ -113,7 +158,7 @@ def construct_peft_loraconfig(info: Dict[str, LoRAInfo]) -> LoraConfig: return config -def construct_peft_lohaconfig(info: Dict[str, LoHaInfo]) -> LoHaConfig: +def construct_peft_lohaconfig(info: Dict[str, LoHaInfo], **kwargs) -> LoHaConfig: """Constructs LoHaConfig from data extracted from adapter checkpoint Args: @@ -156,6 +201,77 @@ def construct_peft_lohaconfig(info: Dict[str, LoHaInfo]) -> LoHaConfig: return config +def construct_peft_lokrconfig(info: Dict[str, LoKrInfo], decompose_factor: int = -1, **kwargs) -> LoKrConfig: + """Constructs LoKrConfig from data extracted from adapter checkpoint + + Args: + info (Dict[str, LoKrInfo]): Information extracted from adapter checkpoint + + Returns: + LoKrConfig: config for constructing LoKr + """ + + # Unpack all ranks and alphas + ranks = {x[0]: x[1].rank for x in info.items()} + alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()} + + # Determine which modules needs to be transformed + target_modules = sorted(info.keys()) + + # Determine most common rank and alpha + r = int(Counter(ranks.values()).most_common(1)[0][0]) + alpha = Counter(alphas.values()).most_common(1)[0][0] + + # Determine which modules have different rank and alpha + rank_pattern = dict(sorted(filter(lambda x: x[1] != r, ranks.items()), key=lambda x: x[0])) + alpha_pattern = dict(sorted(filter(lambda x: x[1] != alpha, alphas.items()), key=lambda x: x[0])) + + # Determine whether any of modules have effective conv2d decomposition + use_effective_conv2d = any(((val.lokr_t2 is not None) for val in info.values())) + + # decompose_both should be enabled if any w1 matrix in any layer is decomposed into 2 + decompose_both = any((val.lokr_w1_a is not None and val.lokr_w1_b is not None) for val in info.values()) + + # Determining decompose factor is a bit tricky (but it is most often -1) + # Check that decompose_factor is equal to provided + for val in info.values(): + # Determine shape of first matrix + if val.lokr_w1 is not None: + w1_shape = tuple(val.lokr_w1.shape) + else: + w1_shape = (val.lokr_w1_a.shape[0], val.lokr_w1_b.shape[1]) + + # Determine shape of second matrix + if val.lokr_w2 is not None: + w2_shape = tuple(val.lokr_w2.shape[:2]) + elif val.lokr_t2 is not None: + w2_shape = (val.lokr_w2_a.shape[1], val.lokr_w2_b.shape[1]) + else: + # We may iterate over Conv2d layer, for which second item in shape is multiplied by 
ksize^2 + w2_shape = (val.lokr_w2_a.shape[0], val.lokr_w2_b.shape[1]) + + # We need to check, whether decompose_factor is really -1 or not + shape = (w1_shape[0], w2_shape[0]) + if factorization(shape[0] * shape[1], factor=-1) != shape: + raise ValueError("Cannot infer decompose_factor, probably it is not equal to -1") + + config = LoKrConfig( + r=r, + alpha=alpha, + target_modules=target_modules, + rank_dropout=0.0, + module_dropout=0.0, + init_weights=False, + rank_pattern=rank_pattern, + alpha_pattern=alpha_pattern, + use_effective_conv2d=use_effective_conv2d, + decompose_both=decompose_both, + decompose_factor=decompose_factor, + ) + + return config + + def combine_peft_state_dict(info: Dict[str, Union[LoRAInfo, LoHaInfo]]) -> Dict[str, torch.Tensor]: result = {} for key_info in info.values(): @@ -180,7 +296,7 @@ def detect_adapter_type(keys: List[str]) -> PeftType: elif any(x in key for x in ["lokr_w1", "lokr_w2", "lokr_t1", "lokr_t2"]): # LoKr may have the following keys: # lokr_w1, lokr_w2, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t1, lokr_t2 - raise ValueError("Currently LoKr adapters are not implemented") + return PeftType.LOKR elif "diff" in key: raise ValueError("Currently full diff adapters are not implemented") else: @@ -231,22 +347,40 @@ def detect_adapter_type(keys: List[str]) -> PeftType: } ) - # Store conversion info (model_type -> peft_key -> LoRAInfo | LoHaInfo) - adapter_info: Dict[str, Dict[str, Union[LoRAInfo, LoHaInfo]]] = { + # Store conversion info (model_type -> peft_key -> LoRAInfo | LoHaInfo | LoKrInfo) + adapter_info: Dict[str, Dict[str, Union[LoRAInfo, LoHaInfo, LoKrInfo]]] = { "text_encoder": {}, "unet": {}, } + # Store decompose_factor for LoKr + decompose_factor = -1 + # Open adapter checkpoint with safetensors.safe_open(args.adapter_path, framework="pt", device="cpu") as f: # Extract information about adapter structure metadata = f.metadata() + # It may be difficult to determine rank for LoKr adapters + # If checkpoint was trained with large rank it may not be utilized during weights creation at all + # So we need to get it from checkpoint metadata (along with decompose_factor) + rank, conv_rank = None, None + if metadata is not None: + rank = metadata.get("ss_network_dim", None) + rank = int(rank) if rank else None + if "ss_network_args" in metadata: + network_args = json.loads(metadata["ss_network_args"]) + conv_rank = network_args.get("conv_dim", None) + conv_rank = int(conv_rank) if conv_rank else rank + decompose_factor = network_args.get("factor", -1) + decompose_factor = int(decompose_factor) + # Detect adapter type based on keys adapter_type = detect_adapter_type(f.keys()) adapter_info_cls = { PeftType.LORA: LoRAInfo, PeftType.LOHA: LoHaInfo, + PeftType.LOKR: LoKrInfo, }[adapter_type] # Iterate through available info and unpack all the values @@ -255,9 +389,9 @@ def detect_adapter_type(keys: List[str]) -> PeftType: # Find which model this key belongs to if kohya_key.startswith(PREFIX_TEXT_ENCODER): - model_type = "text_encoder" + model_type, model = "text_encoder", text_encoder elif kohya_key.startswith(PREFIX_UNET): - model_type = "unet" + model_type, model = "unet", unet else: raise ValueError(f"Cannot determine model for key: {key}") @@ -266,6 +400,9 @@ def detect_adapter_type(keys: List[str]) -> PeftType: raise ValueError(f"Cannot find corresponding key for diffusers/transformers model: {kohya_key}") peft_key = models_keys[kohya_key] + # Retrieve corresponding layer of model + layer = attrgetter(peft_key)(model) + # Create a 
corresponding adapter info if peft_key not in adapter_info[model_type]: adapter_info[model_type][peft_key] = adapter_info_cls(kohya_key=kohya_key, peft_key=peft_key) @@ -305,6 +442,35 @@ def detect_adapter_type(keys: List[str]) -> PeftType: elif kohya_type == "hada_t2": adapter_info[model_type][peft_key].hada_t2 = tensor adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "lokr_t2": + adapter_info[model_type][peft_key].lokr_t2 = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "lokr_w1": + adapter_info[model_type][peft_key].lokr_w1 = tensor + if isinstance(layer, nn.Linear) or ( + isinstance(layer, nn.Conv2d) and tuple(layer.weight.shape[2:]) == (1, 1) + ): + adapter_info[model_type][peft_key].rank = rank + elif isinstance(layer, nn.Conv2d): + adapter_info[model_type][peft_key].rank = conv_rank + elif kohya_type == "lokr_w2": + adapter_info[model_type][peft_key].lokr_w2 = tensor + if isinstance(layer, nn.Linear) or ( + isinstance(layer, nn.Conv2d) and tuple(layer.weight.shape[2:]) == (1, 1) + ): + adapter_info[model_type][peft_key].rank = rank + elif isinstance(layer, nn.Conv2d): + adapter_info[model_type][peft_key].rank = conv_rank + elif kohya_type == "lokr_w1_a": + adapter_info[model_type][peft_key].lokr_w1_a = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[1] + elif kohya_type == "lokr_w1_b": + adapter_info[model_type][peft_key].lokr_w1_b = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "lokr_w2_a": + adapter_info[model_type][peft_key].lokr_w2_a = tensor + elif kohya_type == "lokr_w2_b": + adapter_info[model_type][peft_key].lokr_w2_b = tensor else: raise ValueError(f"Unknown weight name in key: {key} - {kohya_type}") @@ -312,11 +478,12 @@ def detect_adapter_type(keys: List[str]) -> PeftType: construct_config_fn = { PeftType.LORA: construct_peft_loraconfig, PeftType.LOHA: construct_peft_lohaconfig, + PeftType.LOKR: construct_peft_lokrconfig, }[adapter_type] # Process each model sequentially for model, model_name in [(text_encoder, "text_encoder"), (unet, "unet")]: - config = construct_config_fn(adapter_info[model_name]) + config = construct_config_fn(adapter_info[model_name], decompose_factor=decompose_factor) # Output warning for LoHa with use_effective_conv2d if ( diff --git a/examples/stable_diffusion/train_dreambooth_loha.py b/examples/stable_diffusion/train_dreambooth.py similarity index 84% rename from examples/stable_diffusion/train_dreambooth_loha.py rename to examples/stable_diffusion/train_dreambooth.py index 944a8394b6..6fc3a30fc6 100644 --- a/examples/stable_diffusion/train_dreambooth_loha.py +++ b/examples/stable_diffusion/train_dreambooth.py @@ -8,7 +8,7 @@ import threading import warnings from pathlib import Path -from typing import Optional +from typing import Optional, Union import datasets import diffusers @@ -38,7 +38,7 @@ from tqdm.auto import tqdm from transformers import AutoTokenizer, PretrainedConfig -from peft import LoHaConfig, get_peft_model +from peft import LoHaConfig, LoKrConfig, LoraConfig, get_peft_model # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
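A minimal sketch of the `decompose_factor` check performed in `construct_peft_lokrconfig` above, assuming `factorization(n, factor=-1)` returns the most nearly square factor pair `(m, k)` with `m <= k` and `m * k == n`; the shapes below are hypothetical, not taken from a real checkpoint:

from peft.tuners.lokr.layer import factorization

# Hypothetical LoKr factor shapes recovered from a checkpoint
w1_shape = (8, 8)     # left Kronecker factor  (out_l, in_m)
w2_shape = (40, 96)   # right Kronecker factor (out_k, in_n)

shape = (w1_shape[0], w2_shape[0])  # (out_l, out_k) -> out_dim = 8 * 40 = 320
if factorization(shape[0] * shape[1], factor=-1) != shape:
    # under the assumption above, factorization(320, -1) is a near-square pair,
    # not (8, 40), so this checkpoint was not created with decompose_factor=-1
    raise ValueError("Cannot infer decompose_factor, probably it is not equal to -1")
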
@@ -85,6 +85,86 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st raise ValueError(f"{model_class} is not supported.") +def create_unet_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, LoHaConfig, LoKrConfig]: + if args.adapter == "full": + raise ValueError("Cannot create unet adapter config for full parameter") + + if args.adapter == "lora": + config = LoraConfig( + r=args.unet_r, + lora_alpha=args.unet_alpha, + target_modules=UNET_TARGET_MODULES, + lora_dropout=args.unet_dropout, + bias=args.unet_bias, + init_lora_weights=True, + ) + elif args.adapter == "loha": + config = LoHaConfig( + r=args.unet_r, + alpha=args.unet_alpha, + target_modules=UNET_TARGET_MODULES, + rank_dropout=args.unet_rank_dropout, + module_dropout=args.unet_module_dropout, + use_effective_conv2d=args.unet_use_effective_conv2d, + init_weights=True, + ) + elif args.adapter == "lokr": + config = LoKrConfig( + r=args.unet_r, + alpha=args.unet_alpha, + target_modules=UNET_TARGET_MODULES, + rank_dropout=args.unet_rank_dropout, + module_dropout=args.unet_module_dropout, + use_effective_conv2d=args.unet_use_effective_conv2d, + decompose_both=args.unet_decompose_both, + decompose_factor=args.unet_decompose_factor, + init_weights=True, + ) + else: + raise ValueError(f"Unknown adapter type {args.adapter}") + + return config + + +def create_text_encoder_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, LoHaConfig, LoKrConfig]: + if args.adapter == "full": + raise ValueError("Cannot create text_encoder adapter config for full parameter") + + if args.adapter == "lora": + config = LoraConfig( + r=args.te_r, + lora_alpha=args.te_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + lora_dropout=args.te_dropout, + bias=args.te_bias, + init_lora_weights=True, + ) + elif args.adapter == "loha": + config = LoHaConfig( + r=args.te_r, + alpha=args.te_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + rank_dropout=args.te_rank_dropout, + module_dropout=args.te_module_dropout, + init_weights=True, + ) + elif args.adapter == "lokr": + config = LoKrConfig( + r=args.te_r, + alpha=args.te_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + rank_dropout=args.te_rank_dropout, + module_dropout=args.te_module_dropout, + decompose_both=args.te_decompose_both, + decompose_factor=args.te_decompose_factor, + init_weights=True, + ) + else: + raise ValueError(f"Unknown adapter type {args.adapter}") + + return config + + def parse_args(input_args=None): parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( @@ -192,41 +272,6 @@ def parse_args(input_args=None): ) parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") - # loha args - parser.add_argument("--use_loha", action="store_true", help="Whether to use LoHa for parameter efficient tuning") - parser.add_argument("--r", type=int, default=8, help="LoHa rank, only used if use_loha is True") - parser.add_argument("--alpha", type=int, default=32, help="LoHa alpha, only used if use_loha is True") - parser.add_argument("--rank_dropout", type=float, default=0.0, help="LoHa dropout for rank") - parser.add_argument("--module_dropout", type=float, default=0.0, help="LoHa dropout for disabling module at all") - parser.add_argument( - "--use_effective_conv2d", - action="store_true", - help="Use parameter effective decomposition for Conv2d 3x3 with ksize > 1", - ) - parser.add_argument( - "--loha_text_encoder_r", - type=int, - default=8, - 
help="LoHa rank for text encoder, only used if `use_loha` and `train_text_encoder` are True", - ) - parser.add_argument( - "--loha_text_encoder_alpha", - type=int, - default=32, - help="LoHa alpha for text encoder, only used if `use_loha` and `train_text_encoder` are True", - ) - parser.add_argument( - "--loha_text_encoder_rank_dropout", - type=float, - default=0.0, - help="LoHa dropout for text encoder for rank, only used if `use_loha` and `train_text_encoder` are True", - ) - parser.add_argument( - "--loha_text_encoder_module_dropout", - type=float, - default=0.0, - help="LoHa dropout for text encoder for modules, only used if `use_loha` and `train_text_encoder` are True", - ) parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." ) @@ -381,6 +426,132 @@ def parse_args(input_args=None): "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) + # Adapter arguments + subparsers = parser.add_subparsers(dest="adapter") + + # Dummy subparser to train whole model + subparsers.add_parser("full", help="Train full model without adapters") + + # LoRA adapter + lora = subparsers.add_parser("lora", help="Use LoRA adapter") + lora.add_argument("--unet_r", type=int, default=8, help="LoRA rank for unet") + lora.add_argument("--unet_alpha", type=int, default=8, help="LoRA alpha for unet") + lora.add_argument("--unet_dropout", type=float, default=0.0, help="LoRA dropout probability for unet") + lora.add_argument( + "--unet_bias", + type=str, + default="none", + help="Bias type for LoRA. Can be 'none', 'all' or 'lora_only'", + ) + lora.add_argument( + "--te_r", type=int, default=8, help="LoRA rank for text_encoder, only used if `train_text_encoder` is True" + ) + lora.add_argument( + "--te_alpha", + type=int, + default=8, + help="LoRA alpha for text_encoder, only used if `train_text_encoder` is True", + ) + lora.add_argument( + "--te_dropout", + type=float, + default=0.0, + help="LoRA dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + lora.add_argument( + "--te_bias", + type=str, + default="none", + help="Bias type for LoRA. 
Can be 'none', 'all' or 'lora_only', only used if `train_text_encoder` is True", + ) + + # LoHa adapter + loha = subparsers.add_parser("loha", help="Use LoHa adapter") + loha.add_argument("--unet_r", type=int, default=8, help="LoHa rank for unet") + loha.add_argument("--unet_alpha", type=int, default=8, help="LoHa alpha for unet") + loha.add_argument("--unet_rank_dropout", type=float, default=0.0, help="LoHa rank_dropout probability for unet") + loha.add_argument( + "--unet_module_dropout", type=float, default=0.0, help="LoHa module_dropout probability for unet" + ) + loha.add_argument( + "--unet_use_effective_conv2d", + action="store_true", + help="Use parameter effective decomposition in unet for Conv2d 3x3 with ksize > 1", + ) + loha.add_argument( + "--te_r", type=int, default=8, help="LoHa rank for text_encoder, only used if `train_text_encoder` is True" + ) + loha.add_argument( + "--te_alpha", + type=int, + default=8, + help="LoHa alpha for text_encoder, only used if `train_text_encoder` is True", + ) + loha.add_argument( + "--te_rank_dropout", + type=float, + default=0.0, + help="LoHa rank_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + loha.add_argument( + "--te_module_dropout", + type=float, + default=0.0, + help="LoHa module_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + + # LoKr adapter + lokr = subparsers.add_parser("lokr", help="Use LoKr adapter") + lokr.add_argument("--unet_r", type=int, default=8, help="LoKr rank for unet") + lokr.add_argument("--unet_alpha", type=int, default=8, help="LoKr alpha for unet") + lokr.add_argument("--unet_rank_dropout", type=float, default=0.0, help="LoKr rank_dropout probability for unet") + lokr.add_argument( + "--unet_module_dropout", type=float, default=0.0, help="LoKr module_dropout probability for unet" + ) + lokr.add_argument( + "--unet_use_effective_conv2d", + action="store_true", + help="Use parameter effective decomposition in unet for Conv2d 3x3 with ksize > 1", + ) + lokr.add_argument( + "--unet_decompose_both", action="store_true", help="Decompose left matrix in kronecker product for unet" + ) + lokr.add_argument( + "--unet_decompose_factor", type=int, default=-1, help="Decompose factor in kronecker product for unet" + ) + lokr.add_argument( + "--te_r", type=int, default=8, help="LoKr rank for text_encoder, only used if `train_text_encoder` is True" + ) + lokr.add_argument( + "--te_alpha", + type=int, + default=8, + help="LoKr alpha for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_rank_dropout", + type=float, + default=0.0, + help="LoKr rank_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_module_dropout", + type=float, + default=0.0, + help="LoKr module_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_decompose_both", + action="store_true", + help="Decompose left matrix in kronecker product for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_decompose_factor", + type=int, + default=-1, + help="Decompose factor in kronecker product for text_encoder, only used if `train_text_encoder` is True", + ) + if input_args is not None: args = parser.parse_args(input_args) else: @@ -723,16 +894,8 @@ def main(args): args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision ) - if args.use_loha: - config = LoHaConfig( - r=args.r, - 
alpha=args.alpha, - target_modules=UNET_TARGET_MODULES, - rank_dropout=args.rank_dropout, - module_dropout=args.module_dropout, - use_effective_conv2d=args.use_effective_conv2d, - init_weights=True, - ) + if args.adapter != "full": + config = create_unet_adapter_config(args) unet = get_peft_model(unet, config) unet.print_trainable_parameters() print(unet) @@ -740,15 +903,8 @@ def main(args): vae.requires_grad_(False) if not args.train_text_encoder: text_encoder.requires_grad_(False) - elif args.train_text_encoder and args.use_loha: - config = LoHaConfig( - r=args.loha_text_encoder_r, - alpha=args.loha_text_encoder_alpha, - target_modules=TEXT_ENCODER_TARGET_MODULES, - rank_dropout=args.loha_text_encoder_rank_dropout, - module_dropout=args.loha_text_encoder_module_dropout, - init_weights=True, - ) + elif args.train_text_encoder and args.adapter != "full": + config = create_text_encoder_adapter_config(args) text_encoder = get_peft_model(text_encoder, config) text_encoder.print_trainable_parameters() print(text_encoder) @@ -761,7 +917,7 @@ def main(args): if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - if args.train_text_encoder and not args.use_loha: + if args.train_text_encoder and not args.adapter != "full": text_encoder.gradient_checkpointing_enable() # Enable TF32 for faster training on Ampere GPUs, @@ -1018,6 +1174,10 @@ def main(args): pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) + # Set evaliation mode + pipeline.unet.eval() + pipeline.text_encoder.eval() + # run inference if args.seed is not None: generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) @@ -1044,6 +1204,10 @@ def main(args): } ) + # Set evaliation mode + pipeline.unet.train() + pipeline.text_encoder.train() + del pipeline torch.cuda.empty_cache() @@ -1071,7 +1235,7 @@ def main(args): # Create the pipeline using using the trained modules and save it. 
accelerator.wait_for_everyone() if accelerator.is_main_process: - if args.use_loha: + if args.adapter != "full": unwarpped_unet = accelerator.unwrap_model(unet) unwarpped_unet.save_pretrained( os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet) diff --git a/src/peft/__init__.py b/src/peft/__init__.py index a22fc87a08..53ba2bd568 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -51,6 +51,8 @@ LoraModel, LoHaConfig, LoHaModel, + LoKrConfig, + LoKrModel, IA3Config, IA3Model, AdaLoraConfig, diff --git a/src/peft/mapping.py b/src/peft/mapping.py index ecb12be37c..f69e89ec3e 100644 --- a/src/peft/mapping.py +++ b/src/peft/mapping.py @@ -37,6 +37,8 @@ IA3Model, LoHaConfig, LoHaModel, + LoKrConfig, + LoKrModel, LoraConfig, LoraModel, MultitaskPromptTuningConfig, @@ -67,6 +69,7 @@ "P_TUNING": PromptEncoderConfig, "LORA": LoraConfig, "LOHA": LoHaConfig, + "LOKR": LoKrConfig, "ADALORA": AdaLoraConfig, "IA3": IA3Config, "MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, @@ -75,6 +78,7 @@ PEFT_TYPE_TO_TUNER_MAPPING = { "LORA": LoraModel, "LOHA": LoHaModel, + "LOKR": LoKrModel, "ADALORA": AdaLoraModel, "IA3": IA3Model, } diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index c36456b019..64e70f2ba7 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -40,6 +40,7 @@ AdaptionPromptModel, IA3Model, LoHaModel, + LoKrModel, LoraModel, MultitaskPromptEmbedding, PrefixEncoder, @@ -67,6 +68,7 @@ PEFT_TYPE_TO_MODEL_MAPPING = { PeftType.LORA: LoraModel, PeftType.LOHA: LoHaModel, + PeftType.LOKR: LoKrModel, PeftType.PROMPT_TUNING: PromptEmbedding, PeftType.P_TUNING: PromptEncoder, PeftType.PREFIX_TUNING: PrefixEncoder, diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 20f0bb2b31..dd4c94b947 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -20,6 +20,7 @@ from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel from .lora import LoraConfig, LoraModel from .loha import LoHaConfig, LoHaModel +from .lokr import LoKrConfig, LoKrModel from .ia3 import IA3Config, IA3Model from .adalora import AdaLoraConfig, AdaLoraModel from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType diff --git a/src/peft/tuners/ia3/layer.py b/src/peft/tuners/ia3/layer.py index 18a10c88a3..b4ff69cc64 100644 --- a/src/peft/tuners/ia3/layer.py +++ b/src/peft/tuners/ia3/layer.py @@ -43,10 +43,6 @@ def __init__( self.out_features = out_features self.is_feedforward = is_feedforward - @property - def merged(self) -> bool: - return bool(self.merged_adapters) - def update_layer(self, adapter_name, init_ia3_weights): # Actual trainable parameters if self.is_feedforward: diff --git a/src/peft/tuners/loha/config.py b/src/peft/tuners/loha/config.py index 9081883461..7c0f0c81ef 100644 --- a/src/peft/tuners/loha/config.py +++ b/src/peft/tuners/loha/config.py @@ -16,12 +16,12 @@ from dataclasses import dataclass, field from typing import List, Optional, Union -from peft.config import PeftConfig +from peft.tuners.lycoris_utils import LycorisConfig from peft.utils import PeftType @dataclass -class LoHaConfig(PeftConfig): +class LoHaConfig(LycorisConfig): """ This is the configuration class to store the configuration of a [`LoHaModel`]. @@ -92,24 +92,6 @@ class LoHaConfig(PeftConfig): "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." 
}, ) - rank_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" - ) - }, - ) - alpha_pattern: Optional[dict] = field( - default_factory=dict, - metadata={ - "help": ( - "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. " - "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" - ) - }, - ) modules_to_save: Optional[List[str]] = field( default=None, metadata={ diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 250fa4de59..26f57ac681 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -14,57 +14,36 @@ # limitations under the License. import math -import warnings -from typing import Optional, Tuple, Union +from typing import Optional, Set, Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F -from peft.tuners.tuners_utils import BaseTunerLayer +from peft.tuners.lycoris_utils import LycorisLayer -class LoHaLayer(BaseTunerLayer, nn.Module): +class LoHaLayer(LycorisLayer, nn.Module): # List all names of layers that may contain adapter weights adapter_layer_names = ["hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2"] def __init__(self): + LycorisLayer.__init__(self) super(nn.Module, self).__init__() # LoHa info - self.r = {} - self.alpha = {} - self.scaling = {} self.hada_w1_a = nn.ParameterDict({}) self.hada_w1_b = nn.ParameterDict({}) self.hada_w2_a = nn.ParameterDict({}) self.hada_w2_b = nn.ParameterDict({}) self.hada_t1 = nn.ParameterDict({}) self.hada_t2 = nn.ParameterDict({}) - self.rank_dropout = {} - self.module_dropout = {} - - # Tuner info - self._disable_adapters = False - self.merged_adapters = [] @property - def merged(self) -> bool: - return bool(self.merged_adapters) - - def _init_empty_weights(self, cls, *args, **kwargs) -> None: - # A helper method that allows to initialize the layer of the given class without spending time to initialize the - # model weights. The implementation is inspired by - # https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used - # directly. - # Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of - # omitting important logic inside that __init__. - kwargs = kwargs.copy() - final_device = kwargs.pop("device", "cpu") - cls.__init__(self, *args, device="meta", **kwargs) - self.to_empty(device=final_device) - - def create_loha_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...]): + def _available_adapters(self) -> Set[str]: + return {*self.hada_w1_a, *self.hada_w1_b, *self.hada_w2_a, *self.hada_w2_b, *self.hada_t1, *self.hada_t2} + + def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...]): # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75 if len(shape) == 4: self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) @@ -81,7 +60,7 @@ def create_loha_parameters(self, adapter_name: str, r: int, shape: Tuple[int, .. 
self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) - def reset_loha_parameters(self, adapter_name: str): + def reset_adapter_parameters(self, adapter_name: str): # Original implementation performs initialization with normal distribution # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 @@ -104,19 +83,20 @@ def update_layer( rank_dropout: float, module_dropout: float, init_weights: bool, - use_effective_conv2d: bool, + use_effective_conv2d: bool = False, **kwargs, ) -> None: """Internal function to create loha adapter Args: - shape (`Tuple[int, ...]`): Shape of weights to produce - adapter_name (`str`): Name for the adapter to add - r (`int`): Rank for the added adapter - alpha (`float`): Alpha for the added adapter - rank_dropout (`float`): The dropout probability for rank dimension during training + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + alpha (`float`): Alpha for the added adapter. + rank_dropout (`float`): The dropout probability for rank dimension during training. module_dropout (`float`): The dropout probability for disabling adapter during training. - init_weights (`bool`): Whether to initialize weights + init_weights (`bool`): Whether to initialize weights. + use_effective_conv2d (`bool`, *optional*, defaults to `False`): + Use parameter effective decomposition for Conv2d with ksize > 1. """ self.r[adapter_name] = r @@ -135,14 +115,14 @@ def update_layer( else: shape = (self.out_channels, self.in_channels * self.kernel_size[0] * self.kernel_size[1]) else: - raise NotImplementedError(f"LoHa is not implemented for {type(self).__name__} layer") + raise TypeError(f"LoHa is not implemented for {type(self).__name__} layer") # Create weights with provided shape - self.create_loha_parameters(adapter_name, r, shape) + self.create_adapter_parameters(adapter_name, r, shape) # Initialize weights if init_weights: - self.reset_loha_parameters(adapter_name) + self.reset_adapter_parameters(adapter_name) # Move new weights to device weight = getattr(self, "weight", None) @@ -190,72 +170,6 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: return weight - def merge(self) -> None: - if self.merged: - warnings.warn( - f"Already following adapters were merged {','.join(self.merged_adapters)}. " - f"You are now additionally merging {','.join(self.active_adapters)}." - ) - for active_adapter in self.active_adapters: - if active_adapter in self.hada_w1_a.keys(): - self.weight.data += self.get_delta_weight(active_adapter) - self.merged_adapters.append(active_adapter) - - def unmerge(self) -> None: - if not self.merged: - warnings.warn("Already unmerged. 
Nothing to do.") - return - while len(self.merged_adapters) > 0: - active_adapter = self.merged_adapters.pop() - if active_adapter in self.hada_w1_a.keys(): - self.weight.data -= self.get_delta_weight(active_adapter) - - def _op(self, x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - - def forward(self, x: torch.Tensor) -> torch.Tensor: - previous_dtype = x.dtype - - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self._op(x, self.weight) - elif self.merged: - result = self._op(x, self.weight) - else: - # Get base weights - weight = self.weight.data - - # Execute all the adapters - for active_adapter in self.active_adapters: - if active_adapter not in self.hada_w1_a.keys(): - continue - - module_dropout = self.module_dropout[active_adapter] - - # Modify current execution weights - if (not self.training) or (self.training and torch.rand(1) > module_dropout): - weight = weight + self.get_delta_weight(active_adapter) - - # Perform actual operation - result = self._op(x, weight) - - result = result.to(previous_dtype) - return result - - def scale_layer(self, scale_factor: float) -> None: - if scale_factor != 1: - for active_adapter in self.active_adapters: - alpha = self.alpha[active_adapter] - r = self.r[active_adapter] - self.scaling[active_adapter] = (alpha / r) * scale_factor - - def unscale_layer(self) -> None: - for active_adapter in self.active_adapters: - alpha = self.alpha[active_adapter] - r = self.r[active_adapter] - self.scaling[active_adapter] = alpha / r - class Linear(LoHaLayer, nn.Linear): """LoHa implemented in Linear layer""" diff --git a/src/peft/tuners/loha/model.py b/src/peft/tuners/loha/model.py index c9403b76eb..92d5b887ef 100644 --- a/src/peft/tuners/loha/model.py +++ b/src/peft/tuners/loha/model.py @@ -13,25 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re -import warnings -from itertools import chain -from typing import Union +from typing import Dict, Type import torch -from torch import nn -from tqdm import tqdm - -from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists -from peft.utils import ( - ModulesToSaveWrapper, - _get_submodules, -) +from ..lycoris_utils import LycorisTuner from .layer import Conv2d, Linear, LoHaLayer -class LoHaModel(BaseTuner): +class LoHaModel(LycorisTuner): """ Creates Low-Rank Hadamard Product model from a pretrained model. The method is partially described in https://arxiv.org/abs/2108.06098 Current implementation heavily borrows from @@ -87,181 +77,8 @@ class LoHaModel(BaseTuner): - **peft_config** ([`LoHaConfig`]): The configuration of the LoHa model. 
""" - def __init__(self, model, config, adapter_name): - super().__init__(model, config, adapter_name) - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - return getattr(self.model, name) - - def _set_adapter_layers(self, enabled=True): - for module in self.model.modules(): - if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): - module.enable_adapters(enabled) - - def enable_adapter_layers(self): - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self): - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name): - for module in self.model.modules(): - if isinstance(module, LoHaLayer): - if module.merged: - warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") - module.unmerge() - module.set_adapter(adapter_name) - - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - if peft_config.target_modules is None: - raise ValueError("Please specify `target_modules` in `peft_config`") - return peft_config - - @staticmethod - def _check_target_module_exists(loha_config, key): - return check_target_module_exists(loha_config, key) - - def _create_and_replace( - self, - loha_config, - adapter_name: str, - target: Union[LoHaLayer, nn.Module], - target_name, - parent, - current_key, - **optional_kwargs, - ): - """ - A private method to create and replace the target module with the adapter module. - """ - - # Regexp matching - Find key which matches current target_name in patterns provided - pattern_keys = list(chain(loha_config.rank_pattern.keys(), loha_config.alpha_pattern.keys())) - target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name) - - r = loha_config.rank_pattern.get(target_name_key, loha_config.r) - alpha = loha_config.alpha_pattern.get(target_name_key, loha_config.alpha) - - kwargs = { - "r": r, - "alpha": alpha, - "rank_dropout": loha_config.rank_dropout, - "module_dropout": loha_config.module_dropout, - "use_effective_conv2d": loha_config.use_effective_conv2d, - "init_weights": loha_config.init_weights, - } - - if isinstance(target, LoHaLayer): - target.update_layer(adapter_name, **kwargs) - else: - new_module = self._create_new_module(loha_config, adapter_name, target, **kwargs) - self._replace_module(parent, target_name, new_module, target) - - @staticmethod - def _create_new_module(loha_config, adapter_name, target, **kwargs) -> LoHaLayer: - if isinstance(target, torch.nn.Conv2d): - new_module = Conv2d( - target.in_channels, - target.out_channels, - target.weight.size()[2:], - stride=target.stride, - padding=target.padding, - dilation=target.dilation, - groups=target.groups, - bias=target.bias is not None, - padding_mode=target.padding_mode, - device=target.weight.device, - dtype=target.weight.dtype, - adapter_name=adapter_name, - **kwargs, - ) - elif isinstance(target, torch.nn.Linear): - new_module = Linear( - target.in_features, - target.out_features, - bias=target.bias is not None, - device=target.weight.device, - dtype=target.weight.dtype, - adapter_name=adapter_name, - **kwargs, - ) - else: - raise ValueError( - "Target module not found, currently only adapters for nn.Linear and nn.Conv2d are supported" - ) - return new_module - - @staticmethod - def _replace_module(parent, child_name, new_module, child): - setattr(parent, child_name, new_module) - # It's not necessary to set 
requires_grad here, as that is handled by - # _mark_only_adapters_as_trainable - new_module.weight = child.weight - if hasattr(child, "bias"): - new_module.bias = child.bias - - if getattr(child, "state", None) is not None: - new_module.state = child.state - new_module.to(child.weight.device) - - # dispatch to correct device - for name, module in new_module.named_modules(): - if "hada_" in name: - module.to(child.weight.device) - - def _mark_only_adapters_as_trainable(self) -> None: - for n, p in self.model.named_parameters(): - if "hada_" not in n: - p.requires_grad = False - - def merge_and_unload(self, progressbar: bool = False): - return self._unload_and_optionally_merge(progressbar=progressbar) - - def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False): - if merge: - if getattr(self.model, "quantization_method", None) == "gptq": - raise ValueError("Cannot merge LOHA layers when the model is gptq quantized") - - key_list = [key for key, _ in self.model.named_modules() if "hada" not in key] - desc = "Unloading " + ("and merging " if merge else "") + "model" - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - if isinstance(target, LoHaLayer): - if isinstance(target, nn.Conv2d): - new_module = torch.nn.Conv2d( - target.in_channels, - target.out_channels, - kernel_size=target.kernel_size, - stride=target.stride, - padding=target.padding, - dilation=target.dilation, - ) - elif isinstance(target, nn.Linear): - bias = target.bias is not None - new_module = torch.nn.Linear( - target.in_features, - target.out_features, - bias=bias, - device=target.weight.device, - ) - else: - raise ValueError( - "Cannot convert current module to torch module, currently only adapters for nn.Linear and nn.Conv2d are supported" - ) - if merge: - target.merge() - self._replace_module(parent, target_name, new_module, target) - - # save any additional trainable modules part of `modules_to_save` - if isinstance(target, ModulesToSaveWrapper): - setattr(parent, target_name, target.modules_to_save[target.active_adapter]) - - return self.model + prefix: str = "hada_" + layers_mapping: Dict[Type[torch.nn.Module], Type[LoHaLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } diff --git a/src/peft/tuners/lokr/__init__.py b/src/peft/tuners/lokr/__init__.py new file mode 100644 index 0000000000..b137f22c96 --- /dev/null +++ b/src/peft/tuners/lokr/__init__.py @@ -0,0 +1,21 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .config import LoKrConfig +from .layer import Conv2d, Linear, LoKrLayer +from .model import LoKrModel + + +__all__ = ["LoKrConfig", "LoKrModel", "Conv2d", "Linear", "LoKrLayer"] diff --git a/src/peft/tuners/lokr/config.py b/src/peft/tuners/lokr/config.py new file mode 100644 index 0000000000..d99b22aa76 --- /dev/null +++ b/src/peft/tuners/lokr/config.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft.tuners.lycoris_utils import LycorisConfig +from peft.utils import PeftType + + +@dataclass +class LoKrConfig(LycorisConfig): + """ + Configuration class of [`LoKrModel`]. + + Args: + r (`int`): LoKr rank. + alpha (`int`): The alpha parameter for LoKr scaling. + rank_dropout (`int`): The dropout probability for rank dimension during training. + module_dropout (`int`): The dropout probability for disabling LoKr modules during training. + use_effective_conv2d (`bool`): + Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). + decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix. + decompose_factor (`int`): Kronecker product decomposition factor. + target_modules (`Union[List[str],str]`): The names of the modules to apply LoKr to. + init_weights (`bool`): Whether to perform initialization of LoKr weights. + layers_to_transform (`Union[List[int],int]`): + The layer indexes to transform, if this argument is specified, it will apply the LoKr transformations on + the layer indexes that are specified in this list. If a single integer is passed, it will apply the LoKr + transformations on the layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer + pattern is not in the common layers pattern. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `alpha`. + modules_to_save (`List[str]`): The names of modules to be set as trainable except LoKr parameters. 
+ """ + + r: int = field(default=8, metadata={"help": "LoKr rank"}) + alpha: int = field(default=8, metadata={"help": "LoKr alpha"}) + rank_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for rank dimension during training"} + ) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling LoKr modules during training"} + ) + use_effective_conv2d: bool = field( + default=False, + metadata={ + "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' + }, + ) + decompose_both: bool = field( + default=False, + metadata={"help": "Perform rank decomposition of left kronecker product matrix."}, + ) + decompose_factor: int = field(default=-1, metadata={"help": "Kronecker product decomposition factor."}) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with LoKr." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the LoKr layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoKr layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.LOKR diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py new file mode 100644 index 0000000000..9b01ecf96f --- /dev/null +++ b/src/peft/tuners/lokr/layer.py @@ -0,0 +1,373 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
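A short usage sketch for the configuration class above, mirroring how `create_unet_adapter_config` wires it into `get_peft_model` in `train_dreambooth.py`; the base model id and `target_modules` names here are hypothetical placeholders, not the script's `UNET_TARGET_MODULES`:

from diffusers import UNet2DConditionModel
from peft import LoKrConfig, get_peft_model

# Hypothetical base model; any model containing nn.Linear / nn.Conv2d targets works
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

config = LoKrConfig(
    r=8,
    alpha=8,
    target_modules=["to_q", "to_v"],  # hypothetical subset of attention projections
    rank_dropout=0.0,
    module_dropout=0.0,
    decompose_both=False,
    decompose_factor=-1,
    init_weights=True,
)

unet = get_peft_model(unet, config)
unet.print_trainable_parameters()
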
+ +import math +from typing import Optional, Set, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft.tuners.lycoris_utils import LycorisLayer + + +class LoKrLayer(LycorisLayer, nn.Module): + # List all names of layers that may contain adapter weights + adapter_layer_names = [ + "lokr_w1", + "lokr_w1_a", + "lokr_w1_b", + "lokr_w2", + "lokr_w2_a", + "lokr_w2_b", + "lokr_t2", + ] + + def __init__(self): + LycorisLayer.__init__(self) + super(nn.Module, self).__init__() + + # LoKr info + self.lokr_w1 = nn.ParameterDict({}) + self.lokr_w1_a = nn.ParameterDict({}) + self.lokr_w1_b = nn.ParameterDict({}) + self.lokr_w2 = nn.ParameterDict({}) + self.lokr_w2_a = nn.ParameterDict({}) + self.lokr_w2_b = nn.ParameterDict({}) + self.lokr_t2 = nn.ParameterDict({}) + + @property + def _available_adapters(self) -> Set[str]: + return { + *self.lokr_w1, + *self.lokr_w1_a, + *self.lokr_w1_b, + *self.lokr_w2, + *self.lokr_w2_a, + *self.lokr_w2_b, + *self.lokr_t2, + } + + def create_adapter_parameters( + self, + adapter_name: str, + r: int, + shape, + use_w1: bool, + use_w2: bool, + use_effective_conv2d: bool, + ): + if use_w1: + self.lokr_w1[adapter_name] = nn.Parameter(torch.empty(shape[0][0], shape[1][0])) + else: + self.lokr_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0][0], r)) + self.lokr_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][0])) + + if len(shape) == 4: + # Conv2d + if use_w2: + self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], *shape[2:])) + elif use_effective_conv2d: + self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1])) # b, 1-mode + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) # d, 2-mode + else: + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2] * shape[3])) + else: + # Linear + if use_w2: + self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1])) + else: + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) + + def reset_adapter_parameters(self, adapter_name: str): + if adapter_name in self.lokr_w1: + nn.init.zeros_(self.lokr_w1[adapter_name]) + else: + nn.init.zeros_(self.lokr_w1_a[adapter_name]) + nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_w2: + nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_t2: + nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + alpha: float, + rank_dropout: float, + module_dropout: float, + init_weights: bool, + use_effective_conv2d: bool, + decompose_both: bool, + decompose_factor: int, + **kwargs, + ) -> None: + """Internal function to create lokr adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + alpha (`float`): Alpha for the added adapter. 
+ rank_dropout (`float`): The dropout probability for rank dimension during training + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize adapter weights. + use_effective_conv2d (`bool`): Use parameter effective decomposition for Conv2d with ksize > 1. + decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix. + decompose_factor (`int`): Kronecker product decomposition factor. + """ + + self.r[adapter_name] = r + self.alpha[adapter_name] = alpha + self.scaling[adapter_name] = alpha / r + self.rank_dropout[adapter_name] = rank_dropout + self.module_dropout[adapter_name] = module_dropout + + # Determine shape of LoKr weights + if isinstance(self, nn.Linear): + in_dim, out_dim = self.in_features, self.out_features + + in_m, in_n = factorization(in_dim, decompose_factor) + out_l, out_k = factorization(out_dim, decompose_factor) + shape = ((out_l, out_k), (in_m, in_n)) # ((a, b), (c, d)), out_dim = a*c, in_dim = b*d + + use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) + use_w2 = not (r < max(shape[0][1], shape[1][1]) / 2) + use_effective_conv2d = False + elif isinstance(self, nn.Conv2d): + in_dim, out_dim = self.in_channels, self.out_channels + k_size = self.kernel_size + + in_m, in_n = factorization(in_dim, decompose_factor) + out_l, out_k = factorization(out_dim, decompose_factor) + shape = ((out_l, out_k), (in_m, in_n), *k_size) # ((a, b), (c, d), *k_size) + + use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) + use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 + use_effective_conv2d = use_effective_conv2d and self.kernel_size != (1, 1) + else: + raise TypeError(f"LoKr is not implemented for {type(self).__name__} layer") + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape, use_w1, use_w2, use_effective_conv2d) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + + # Move new weights to device + weight = getattr(self, "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + # https://github.com/KohakuBlueleaf/LyCORIS/blob/e4259b870d3354a9615a96be61cb5d07455c58ea/lycoris/modules/lokr.py#L224 + if adapter_name in self.lokr_w1: + w1 = self.lokr_w1[adapter_name] + else: + w1 = self.lokr_w1_a[adapter_name] @ self.lokr_w1_b[adapter_name] + + if adapter_name in self.lokr_w2: + w2 = self.lokr_w2[adapter_name] + elif adapter_name in self.lokr_t2: + w2 = make_weight_cp(self.lokr_t2[adapter_name], self.lokr_w2_a[adapter_name], self.lokr_w2_b[adapter_name]) + else: + w2 = self.lokr_w2_a[adapter_name] @ self.lokr_w2_b[adapter_name] + + # Make weights with Kronecker product + weight = make_kron(w1, w2) + weight = weight.reshape(self.weight.shape) + + # Perform rank dropout during training - drop rows of addition weights + rank_dropout = self.rank_dropout[adapter_name] + if self.training and rank_dropout: + drop = (torch.rand(weight.size(0)) > rank_dropout).float() + drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device) + drop /= drop.mean() + weight *= drop + + return weight + + +class Linear(LoKrLayer, nn.Linear): + """LoKr implemented in Linear layer""" + + 
def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[torch.dtype] = None, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + **kwargs, + ): + init_weights = kwargs.pop("init_weights", True) + self._init_empty_weights(nn.Linear, in_features, out_features, bias, device=device, dtype=dtype) + + LoKrLayer.__init__(self) + + # Create adapter and set it active + self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs) + self.set_adapter(adapter_name) + + def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + return F.linear(input, weight, bias=self.bias) + + +class Conv2d(LoKrLayer, nn.Conv2d): + """LoKr implemented in Conv2d layer""" + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]], + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 0, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[torch.dtype] = None, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + use_effective_conv2d: bool = False, + **kwargs, + ): + init_weights = kwargs.pop("init_weights", True) + self._init_empty_weights( + nn.Conv2d, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode, + device=device, + dtype=dtype, + ) + + LoKrLayer.__init__(self) + + # Create adapter and set it active + self.update_layer( + adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + ) + self.set_adapter(adapter_name) + + def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + return F.conv2d( + input, + weight, + bias=self.bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + ) + + +# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11 + + +def factorization(dimension: int, factor: int = -1) -> Tuple[int, int]: + """Factorizes the provided number into the product of two numbers + + Args: + dimension (`int`): The number that needs to be factorized. + factor (`int`, optional): + Factorization divider. The algorithm will try to output two numbers, one of each will be as close to the + factor as possible. If -1 is provided, the decomposition algorithm would try to search dividers near the + square root of the dimension. Defaults to -1. + + Returns: + Tuple[`int`, `int`]: A tuple of two numbers, whose product is equal to the provided number. The first number is + always less than or equal to the second. 
+ + Example: + ```py + >>> factorization(256, factor=-1) + (16, 16) + + >>> factorization(128, factor=-1) + (8, 16) + + >>> factorization(127, factor=-1) + (1, 127) + + >>> factorization(128, factor=4) + (4, 32) + ``` + """ + + if factor > 0 and (dimension % factor) == 0: + m = factor + n = dimension // factor + return m, n + if factor == -1: + factor = dimension + m, n = 1, dimension + length = m + n + while m < n: + new_m = m + 1 + while dimension % new_m != 0: + new_m += 1 + new_n = dimension // new_m + if new_m + new_n > length or new_m > factor: + break + else: + m, n = new_m, new_n + if m > n: + n, m = m, n + return m, n + + +def make_weight_cp(t, wa, wb): + rebuild2 = torch.einsum("i j k l, i p, j r -> p r k l", t, wa, wb) # [c, d, k1, k2] + return rebuild2 + + +def make_kron(w1, w2, scale=1.0): + if len(w2.shape) == 4: + w1 = w1.unsqueeze(2).unsqueeze(2) + w2 = w2.contiguous() + rebuild = torch.kron(w1, w2) + + return rebuild * scale diff --git a/src/peft/tuners/lokr/model.py b/src/peft/tuners/lokr/model.py new file mode 100644 index 0000000000..e08b7a7c48 --- /dev/null +++ b/src/peft/tuners/lokr/model.py @@ -0,0 +1,85 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Type + +import torch + +from ..lycoris_utils import LycorisTuner +from .layer import Conv2d, Linear, LoKrLayer + + +class LoKrModel(LycorisTuner): + """ + Creates Low-Rank Kronecker Product model from a pretrained model. The original method is partially described in + https://arxiv.org/abs/2108.06098 and in https://arxiv.org/abs/2309.14859 Current implementation heavily borrows + from + https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`LoKrConfig`]): The configuration of the LoKr model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The LoKr model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import LoKrModel, LoKrConfig + + >>> config_te = LoKrConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = LoKrConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... use_effective_conv2d=True, + ... 
) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = LoKrModel(model.text_encoder, config_te, "default") + >>> model.unet = LoKrModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`LoKrConfig`]): The configuration of the LoKr model. + """ + + prefix: str = "lokr_" + layers_mapping: Dict[Type[torch.nn.Module], Type[LoKrLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index df6083dd8e..0eb2efa2f2 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -46,10 +46,6 @@ def __init__(self, in_features: int, out_features: int, **kwargs): self.out_features = out_features self.kwargs = kwargs - @property - def merged(self) -> bool: - return bool(self.merged_adapters) - def _init_empty_weights(self, cls, *args, **kwargs) -> None: # A helper method that allows to initialize the layer of the given class without spending time to initialize the # model weights. The implementation is inspired by diff --git a/src/peft/tuners/lycoris_utils.py b/src/peft/tuners/lycoris_utils.py new file mode 100644 index 0000000000..8d3fb7481b --- /dev/null +++ b/src/peft/tuners/lycoris_utils.py @@ -0,0 +1,407 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import warnings +from abc import abstractmethod +from dataclasses import dataclass, field +from itertools import chain +from typing import Dict, Optional, Set, Type, Union + +import torch +import torch.nn as nn +from tqdm import tqdm + +from peft.config import PeftConfig +from peft.utils import ( + ModulesToSaveWrapper, + _get_submodules, +) + +from .tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists + + +@dataclass +class LycorisConfig(PeftConfig): + r""" + A base config for LyCORIS like adapters + """ + rank_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" + ) + }, + ) + alpha_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" + ) + }, + ) + + +class LycorisLayer(BaseTunerLayer, nn.Module): + r""" + A base layer for LyCORIS like adapters + """ + + def __init__(self): + self.r = {} + self.alpha = {} + self.scaling = {} + self.rank_dropout = {} + self.module_dropout = {} + + # Tuner info + self._disable_adapters = False + self.merged_adapters = [] + + @property + @abstractmethod + def _available_adapters(self) -> Set[str]: + ... 
+ + def _init_empty_weights(self, cls, *args, **kwargs) -> None: + # A helper method that allows to initialize the layer of the given class without spending time to initialize the + # model weights. The implementation is inspired by + # https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used + # directly. + # Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of + # omitting important logic inside that __init__. + kwargs = kwargs.copy() + final_device = kwargs.pop("device", "cpu") + cls.__init__(self, *args, device="meta", **kwargs) + self.to_empty(device=final_device) + + def _op(self, x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs): + ... + + def forward(self, x: torch.Tensor) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self._op(x, self.weight) + elif self.merged: + result = self._op(x, self.weight) + else: + # Get base weights + weight = self.weight.data + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + weight = weight + self.get_delta_weight(active_adapter) + + # Perform actual operation + result = self._op(x, weight) + + result = result.to(previous_dtype) + return result + + @abstractmethod + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + ... + + def merge(self) -> None: + if self.merged: + warnings.warn( + f"Already following adapters were merged {','.join(self.merged_adapters)}. " + f"You are now additionally merging {','.join(self.active_adapters)}." + ) + for active_adapter in self.active_adapters: + if active_adapter in self._available_adapters: + self.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + @abstractmethod + def reset_adapter_parameters(self, adapter_name: str): + ... + + def set_scale(self, adapter, scale): + if adapter not in self._available_adapters: + # Ignore the case where the adapter is not in the layer + return + self.scaling[adapter] = scale * self.alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + self.scaling[active_adapter] *= scale + + def unmerge(self) -> None: + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + self.weight.data -= self.get_delta_weight(active_adapter) + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + if scale is None: + self.scaling[active_adapter] = self.alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] /= scale + + @abstractmethod + def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs): + ... 
+ + +class LycorisTuner(BaseTuner): + r""" + A base tuner for LyCORIS like adapters + """ + + prefix: str + layers_mapping: Dict[Type[torch.nn.Module], Type[LycorisLayer]] + + def __init__(self, model, config, adapter_name): + super().__init__(model, config, adapter_name) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + @staticmethod + def _check_target_module_exists(config, key): + return check_target_module_exists(config, key) + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LycorisLayer, nn.Module], + target_name, + parent, + current_key, + **optional_kwargs, + ): + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) + + if isinstance(target, LycorisLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) + + @classmethod + def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs) -> LycorisLayer: + # Find corresponding subtype of provided target module + new_module_cls = None + for subtype, target_cls in cls.layers_mapping.items(): + if isinstance(target, subtype): + new_module_cls = target_cls + break + + # We didn't find corresponding type, so adapter for this layer is not supported + if new_module_cls is None: + raise ValueError( + f"Target module not found, currently only adapters for {', '.join([x.__name__ for x in cls.modules_mapping.keys()])} are supported" + ) + + if isinstance(target, torch.nn.Conv2d): + new_module = new_module_cls( + target.in_channels, + target.out_channels, + target.weight.size()[2:], + stride=target.stride, + padding=target.padding, + dilation=target.dilation, + groups=target.groups, + bias=target.bias is not None, + padding_mode=target.padding_mode, + device=target.weight.device, + dtype=target.weight.dtype, + adapter_name=adapter_name, + **kwargs, + ) + elif isinstance(target, torch.nn.Linear): + new_module = new_module_cls( + target.in_features, + target.out_features, + bias=target.bias is not None, + device=target.weight.device, + dtype=target.weight.dtype, + adapter_name=adapter_name, + **kwargs, + ) + else: + raise ValueError( + "Target module not found, currently only adapters for nn.Linear and nn.Conv2d are supported" + ) + + return new_module + + def _mark_only_adapters_as_trainable(self) -> None: + for n, p in self.model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + raise ValueError("Please specify `target_modules` in `peft_config`") + return peft_config + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set 
requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if self.prefix in name: + module.to(child.weight.device) + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge LOHA layers when the model is gptq quantized") + + key_list = [key for key, _ in self.model.named_modules() if "hada" not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + if isinstance(target, LycorisLayer): + if isinstance(target, nn.Conv2d): + new_module = torch.nn.Conv2d( + target.in_channels, + target.out_channels, + kernel_size=target.kernel_size, + stride=target.stride, + padding=target.padding, + dilation=target.dilation, + ) + elif isinstance(target, nn.Linear): + bias = target.bias is not None + new_module = torch.nn.Linear( + target.in_features, + target.out_features, + bias=bias, + device=target.weight.device, + ) + else: + raise ValueError( + "Cannot convert current module to torch module, currently only adapters for nn.Linear and nn.Conv2d are supported" + ) + if merge: + target.merge() + self._replace_module(parent, target_name, new_module, target) + + # save any additional trainable modules part of `modules_to_save` + if isinstance(target, ModulesToSaveWrapper): + setattr(parent, target_name, target.modules_to_save[target.active_adapter]) + + return self.model + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + self._set_adapter_layers(enabled=False) + + def merge_and_unload(self, progressbar: bool = False): + return self._unload_and_optionally_merge(progressbar=progressbar) + + def set_adapter(self, adapter_name): + for module in self.model.modules(): + if isinstance(module, LycorisLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + + def delete_adapter(self, adapter_name: str): + """ + Deletes an existing adapter. + + Args: + adapter_name (`str`): Name of the adapter to be deleted. 
+ """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LycorisLayer): + for attr in target.adapter_layer_names: + if adapter_name in getattr(target, attr): + getattr(target, attr).pop(adapter_name) + if adapter_name in target.active_adapters: + resetting_active_adapter = ( + list(self.peft_config.keys())[0] if len(self.peft_config) > 0 else "default" + ) + warnings.warn( + f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to {resetting_active_adapter}. " + ) + target.set_adapter(resetting_active_adapter) diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py index 10f8754296..9307e5fc12 100644 --- a/src/peft/tuners/tuners_utils.py +++ b/src/peft/tuners/tuners_utils.py @@ -281,12 +281,19 @@ class BaseTunerLayer(ABC): # the currently active adapter(s) _active_adapter: str | list[str] = "default" + # List all merged adapters + merged_adapters: list[str] = [] + def merge(self, *args) -> None: raise NotImplementedError def unmerge(self, *args) -> None: raise NotImplementedError + @property + def merged(self) -> bool: + return bool(self.merged_adapters) + @property def disable_adapters(self) -> bool: # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method @@ -349,7 +356,7 @@ def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None: """A helper method to check if the passed module's key name matches any of the target modules in the adapter_config. 
Args: - config (`LoraConfig` | `LoHaConfig`): A config to match target modules from + config (`LoraConfig` | `LycorisConfig`): A config to match target modules from key (`str`): A key to search any matches in config Returns: diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index d073be81c3..29c764a08f 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -29,6 +29,7 @@ class PeftType(str, enum.Enum): ADAPTION_PROMPT = "ADAPTION_PROMPT" IA3 = "IA3" LOHA = "LOHA" + LOKR = "LOKR" class TaskType(str, enum.Enum): diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index ff00541121..cd8088e93e 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -75,6 +75,9 @@ def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", un elif config.peft_type == PeftType.LOHA: to_return = {k: state_dict[k] for k in state_dict if "hada_" in k} + elif config.peft_type == PeftType.LOKR: + to_return = {k: state_dict[k] for k in state_dict if "lokr_" in k} + elif config.peft_type == PeftType.ADAPTION_PROMPT: to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")} elif config.is_prompt_learning: @@ -123,13 +126,14 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul else: state_dict = peft_model_state_dict - if config.peft_type in (PeftType.LORA, PeftType.LOHA, PeftType.ADALORA, PeftType.IA3): + if config.peft_type in (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.IA3): peft_model_state_dict = {} parameter_prefix = { PeftType.IA3: "ia3_", PeftType.LORA: "lora_", PeftType.ADALORA: "lora_", PeftType.LOHA: "hada_", + PeftType.LOKR: "lokr_", }[config.peft_type] for k, v in state_dict.items(): if parameter_prefix in k: diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index d3802d08bd..9bd4dec9b6 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -24,7 +24,7 @@ from torch import nn from transformers.pytorch_utils import Conv1D -from peft import AdaLoraConfig, IA3Config, LoHaConfig, LoraConfig, PeftModel, get_peft_model +from peft import AdaLoraConfig, IA3Config, LoHaConfig, LoKrConfig, LoraConfig, PeftModel, get_peft_model from peft.tuners.tuners_utils import BaseTunerLayer from .testing_common import PeftCommonTester @@ -141,10 +141,56 @@ "module_dropout": 0.1, }, ), + ("Vanilla MLP 7 LOHA", "MLP", LoHaConfig, {"target_modules": "lin0", "rank_dropout": 0.5}), ("Conv2d 1 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"]}), ("Conv2d 2 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"]}), ("Conv2d 3 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), ("Conv2d 4 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), + # LoKr + ("Vanilla MLP 1 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0", "lin1"]}), + ("Vanilla MLP 5 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ( + "Vanilla MLP 6 LOKR", + "MLP", + LoKrConfig, + { + "target_modules": ["lin0"], + "alpha": 4, + "module_dropout": 0.1, + }, + ), + ("Vanilla MLP 7 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0", "rank_dropout": 0.5}), 
+ ("Vanilla MLP 8 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0", "decompose_both": True, "r": 1, "alpha": 1}), + ("Conv2d 1 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"]}), + ("Conv2d 2 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"]}), + ("Conv2d 3 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), + ("Conv2d 4 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), + ( + "Conv2d 5 LOKR", + "Conv2d", + LoKrConfig, + {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True, "decompose_both": True}, + ), + ( + "Conv2d 6 LOKR", + "Conv2d", + LoKrConfig, + {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True, "decompose_factor": 4}, + ), + ( + "Conv2d 7 LOKR", + "Conv2d", + LoKrConfig, + { + "target_modules": ["conv2d", "lin0"], + "use_effective_conv2d": True, + "decompose_both": True, + "decompose_factor": 4, + }, + ), ] MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES = [ @@ -211,6 +257,7 @@ IA3Config: "ia3_", LoraConfig: "lora_", LoHaConfig: "hada_", + LoKrConfig: "lokr_", } diff --git a/tests/test_stablediffusion.py b/tests/test_stablediffusion.py index a9a1f5a4ad..830614a7ab 100644 --- a/tests/test_stablediffusion.py +++ b/tests/test_stablediffusion.py @@ -64,6 +64,7 @@ CLASSES_MAPPING = { "lora": (LoraConfig, CONFIG_TESTING_KWARGS[0]), "loha": (LoHaConfig, CONFIG_TESTING_KWARGS[1]), + "lokr": (LoHaConfig, CONFIG_TESTING_KWARGS[1]), } @@ -147,7 +148,7 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): "model_ids": PEFT_DIFFUSERS_SD_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, }, - filter_params_func=lambda tests: [x for x in tests if "loha" not in x[0]], + filter_params_func=lambda tests: [x for x in tests if all(s not in x[0] for s in ["loha", "lokr"])], ) ) def test_add_weighted_adapter_base_unchanged(self, test_name, model_id, config_cls, config_kwargs): @@ -176,6 +177,7 @@ def test_add_weighted_adapter_base_unchanged(self, test_name, model_id, config_c "model_ids": PEFT_DIFFUSERS_SD_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, "loha_kwargs": {"init_weights": [False]}, + "lokr_kwargs": {"init_weights": [False]}, }, ) ) From bdeb06b16c204c5b1f44f3081fa01009353220cb Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 31 Oct 2023 16:51:41 +0100 Subject: [PATCH 03/65] [`core`] Fix `use_reentrant` issues (#1036) * fix use_reentrant issues * fix * fixup * address comments. * add warnings * oops * fix * quality --- src/peft/utils/other.py | 50 ++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 62aedce273..e33c52e21e 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -63,18 +63,29 @@ def starcoder_model_postprocess_past_key_value(past_key_values): return tuple(result) -def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True): +def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, gradient_checkpointing_kwargs=None): r""" + Note this method only works for `transformers` models. + This method wraps the entire protocol for preparing a model before running a training. 
This includes: 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm head to fp32 Args: - model, (`transformers.PreTrainedModel`): + model (`transformers.PreTrainedModel`): The loaded model from `transformers` + use_gradient_checkpointing (`bool`, *optional*, defaults to `True`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): + Keyword arguments to pass to the gradient checkpointing function, please refer to the documentation of + `torch.utils.checkpoint.checkpoint` for more details about the arguments that you can pass to that method. + Note this is only available in the latest transformers versions (> 4.34.1). """ loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq" + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + for name, param in model.named_parameters(): # freeze base model's layers param.requires_grad = False @@ -86,19 +97,36 @@ def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True): param.data = param.data.to(torch.float32) if (loaded_in_kbit or is_gptq_quantized) and use_gradient_checkpointing: - # For backward compatibility - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - else: + # When having `use_reentrant=False` + gradient_checkpointing, there is no need for this hack + if "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]: + # For backward compatibility + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - # enable gradient checkpointing for memory efficiency - model.gradient_checkpointing_enable() + # To support older transformers versions, check if the model supports gradient_checkpointing_kwargs + _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list( + inspect.signature(model.gradient_checkpointing_enable).parameters + ) + if not _supports_gc_kwargs and len(gradient_checkpointing_kwargs) > 0: + warnings.warn( + "gradient_checkpointing_kwargs is not supported in this version of transformers. The passed kwargs will be ignored." 
+ " if you want to use that feature, please upgrade to the latest version of transformers.", + FutureWarning, + ) + + gc_enable_kwargs = ( + {} if not _supports_gc_kwargs else {"gradient_checkpointing_kwargs": gradient_checkpointing_kwargs} + ) + + # enable gradient checkpointing for memory efficiency + model.gradient_checkpointing_enable(**gc_enable_kwargs) return model From 6960076699a060bfa10d64d02bf641faa421b443 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 1 Nov 2023 10:48:12 +0100 Subject: [PATCH 04/65] [`tests`] Update Dockerfile to use cuda 12.2 (#1050) * [`tests`] Update Dockerfile to use cuda 12.2 * Update nightly.yml --- .github/workflows/nightly.yml | 2 ++ docker/peft-gpu/Dockerfile | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 57caba1c61..d0c537a947 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -8,6 +8,8 @@ on: env: RUN_SLOW: "yes" IS_GITHUB_CI: "1" + # To be able to run tests on CUDA 12.2 + NVIDIA_DISABLE_REQUIRE: "1" SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }} diff --git a/docker/peft-gpu/Dockerfile b/docker/peft-gpu/Dockerfile index 375e9a65e3..9b538d218d 100644 --- a/docker/peft-gpu/Dockerfile +++ b/docker/peft-gpu/Dockerfile @@ -40,7 +40,7 @@ RUN source activate peft && \ peft[test]@git+https://github.com/huggingface/peft # Stage 2 -FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS build-image +FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image COPY --from=compile-image /opt/conda /opt/conda ENV PATH /opt/conda/bin:$PATH From 0ad95fa361effdead52dfd1ddd8ca7408803ed25 Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Wed, 1 Nov 2023 03:39:40 -0700 Subject: [PATCH 05/65] TST test coverage for layer matching (#1031) Add tests for module name matching using regex and other custom arguments. 
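
As a quick illustration of what these tests exercise, here is a reviewer-facing sketch, not part of the diff itself; it only reuses the tiny test model, the `inspect_matched_modules` helper, and the new `LoraConfig` validation that appear in the changes below:

```py
from transformers import AutoModel

from peft import LoraConfig, get_peft_model
from peft.tuners.tuners_utils import inspect_matched_modules

# Same tiny model used in tests/test_tuners_utils.py; its attention projection is named
# "query_key_value", which the default LoraConfig target_modules mapping picks up.
model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BloomForCausalLM")
peft_model = get_peft_model(model, LoraConfig())

# Helper added in this patch: reports which module keys matched the adapter config and which did not,
# as a dict with "matched" and "unmatched" lists.
print(inspect_matched_modules(peft_model)["matched"])

# Also covered below: layer indexing only works when target_modules is a list of names; combining it
# with a regex target_modules now raises a ValueError, e.g.
# LoraConfig(target_modules=".*query_key_value", layers_to_transform=[0])  # raises ValueError
```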
--- src/peft/tuners/ia3/model.py | 17 ++- src/peft/tuners/lora/config.py | 13 ++- src/peft/tuners/tuners_utils.py | 15 +++ tests/test_config.py | 23 +++++ tests/test_tuners_utils.py | 177 ++++++++++++++++++++++++++++++++ 5 files changed, 239 insertions(+), 6 deletions(-) create mode 100644 tests/test_tuners_utils.py diff --git a/src/peft/tuners/ia3/model.py b/src/peft/tuners/ia3/model.py index 023286d42f..3714134cc2 100644 --- a/src/peft/tuners/ia3/model.py +++ b/src/peft/tuners/ia3/model.py @@ -190,10 +190,7 @@ def _create_and_replace( current_key = optional_kwargs["current_key"] # check if target module is in feedforward_modules - if isinstance(ia3_config.feedforward_modules, str): - is_feedforward = re.fullmatch(ia3_config.feedforward_modules, current_key) - else: - is_feedforward = any(current_key.endswith(target_key) for target_key in ia3_config.feedforward_modules) + is_feedforward = self._check_target_module_feedforward(ia3_config, current_key) kwargs = { "fan_in_fan_out": ia3_config.fan_in_fan_out, @@ -225,6 +222,18 @@ def _create_and_replace( new_module.requires_grad_(False) self._replace_module(parent, target_name, new_module, target) + @staticmethod + def _check_target_module_feedforward(ia3_config, key) -> bool: + """ + A helper private method that checks if the target module `key` matches with a feedforward module specified in + `ia3_config` + """ + if isinstance(ia3_config.feedforward_modules, str): + is_feedforward = bool(re.fullmatch(ia3_config.feedforward_modules, key)) + else: + is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules) + return is_feedforward + @staticmethod def _replace_module(parent, child_name, new_module, child): setattr(parent, child_name, new_module) diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 302080a1a8..2412b61a1a 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -88,13 +88,15 @@ class LoraConfig(PeftConfig): layers_to_transform: Optional[Union[List[int], int]] = field( default=None, metadata={ - "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." }, ) - layers_pattern: Optional[str] = field( + layers_pattern: Optional[Union[List[str], str]] = field( default=None, metadata={ "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + "This only works when target_modules is a list of str." 
}, ) rank_pattern: Optional[dict] = field( @@ -121,3 +123,10 @@ def __post_init__(self): self.target_modules = ( set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules ) + # if target_modules is a regex expression, then layers_to_transform should be None + if isinstance(self.target_modules, str) and self.layers_to_transform is not None: + raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") + + # if target_modules is a regex expression, then layers_pattern should be None + if isinstance(self.target_modules, str) and self.layers_pattern is not None: + raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py index 9307e5fc12..8ca3abfa56 100644 --- a/src/peft/tuners/tuners_utils.py +++ b/src/peft/tuners/tuners_utils.py @@ -389,3 +389,18 @@ def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None: else: target_module_found = False return target_module_found + + +def inspect_matched_modules(tuner: BaseTuner, adapter_name: str = "default") -> dict: + """ + A helper function to inspect the set of matched and unmatched modules for a PEFT model and the given adapter. + """ + config = tuner.peft_config[adapter_name] + key_list = [key for key, _ in tuner.model.named_modules()] + module_dict = {"matched": [], "unmatched": []} + for key in key_list: + if tuner._check_target_module_exists(config, key): + module_dict["matched"].append(key) + else: + module_dict["unmatched"].append(key) + return module_dict diff --git a/tests/test_config.py b/tests/test_config.py index 2e38cba657..7b038f275e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -198,3 +198,26 @@ def test_save_pretrained_with_target_modules(self, config_class): self.assertEqual(config.to_dict(), config_from_pretrained.to_dict()) # explicit test that target_modules should be converted to set self.assertTrue(isinstance(config_from_pretrained.target_modules, set)) + + def test_regex_with_layer_indexing_lora(self): + # This test checks that an error is raised if `target_modules` is a regex expression and `layers_to_transform` or + # `layers_pattern` are not None + + invalid_config1 = {"target_modules": ".*foo", "layers_to_transform": [0]} + invalid_config2 = {"target_modules": ".*foo", "layers_pattern": ["bar"]} + + valid_config = {"target_modules": ["foo"], "layers_pattern": ["bar"], "layers_to_transform": [0]} + + with self.assertRaisesRegex( + ValueError, + expected_regex="`layers_to_transform` cannot be used when `target_modules` is a str.", + ): + LoraConfig(**invalid_config1) + + with self.assertRaisesRegex( + ValueError, expected_regex="`layers_pattern` cannot be used when `target_modules` is a str." + ): + LoraConfig(**invalid_config2) + + # should run without errors + LoraConfig(**valid_config) diff --git a/tests/test_tuners_utils.py b/tests/test_tuners_utils.py new file mode 100644 index 0000000000..7cc0a1767b --- /dev/null +++ b/tests/test_tuners_utils.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +from parameterized import parameterized +from transformers import AutoModel + +from peft import IA3Config, LoraConfig, get_peft_model +from peft.tuners.tuners_utils import check_target_module_exists, inspect_matched_modules + + +# Implements tests for regex matching logic common for all BaseTuner subclasses, and also +# tests for correct behaviour with different config kwargs for BaseTuners (Ex: feedforward for IA3, etc) + +TEST_CASES = [ + # tuple of + # 1. key + # 2. target_modules + # 3. layers_to_transform + # 4. layers_pattern + # 5. expected result + # some basic examples + ("", [], None, None, False), + ("", ["foo"], None, None, False), + ("foo", [], None, None, False), + ("foo", ["foo"], None, None, True), + ("foo", ["bar"], None, None, False), + ("foo", ["foo", "bar"], None, None, True), + # with regex + ("foo", "foo", None, None, True), + ("foo", ".*oo", None, None, True), + ("foo", "fo.*", None, None, True), + ("foo", ".*bar.*", None, None, False), + ("foobar", ".*oba.*", None, None, True), + # with layers_to_transform + ("foo.bar.1.baz", ["baz"], [1], ["bar"], True), + ("foo.bar.1.baz", ["baz"], [0], ["bar"], False), + ("foo.bar.1.baz", ["baz"], [2], ["bar"], False), + ("foo.bar.10.baz", ["baz"], [0], ["bar"], False), + ("foo.bar.10.baz", ["baz"], [1], ["bar"], False), + ("foo.bar.1.baz", ["baz"], [0, 1, 2], ["bar"], True), + ("foo.bar.1.baz", ["baz", "spam"], [1], ["bar"], True), + ("foo.bar.1.baz", ["baz", "spam"], [0, 1, 2], ["bar"], True), + # TODO: Unclear what expected behaviour is when layers_pattern is an empty list. + # Currently, an empty layers_pattern leads to all layer indexes being matched, + # which means layers_to_transform is ignored. + ("foo.bar.1.baz", ["baz"], [1], [], True), + # TODO: Below test currently fails, again because of empty layers_pattern + # layers_to_transform is 0, but layers_pattern is empty, so all layer indexes are matched + # ("foo.bar.1.baz", ["baz"], [0], [], False), + ("foo.bar.1.baz", ["baz"], [1], ["ar"], True), + # some realistic examples: transformers model + ("transformer.h.1.attn.attention.q_proj.foo", ["q_proj"], None, [], False), + ("transformer.h.1.attn.attention.q_proj", [], None, [], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], None, [], True), + ("transformer.h.1.attn.attention.q_proj", ["q_proj", "v_proj"], None, [], True), + ("transformer.h.1.attn.attention.resid_dropout", ["q_proj", "v_proj"], None, [], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [1], ["h"], True), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [0], ["h"], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [2], ["h"], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [0, 1, 2], ["h"], True), + ("transformer.h.1.attn.attention.q_proj", ["q_proj", "v_proj"], [0, 1, 2], ["h"], True), + ("foo.bar.q_proj", ["q_proj"], None, [], True), + ("foo.bar.1.baz", ["baz"], [1], ["foo"], False), + # other corner cases. 
For ex, below is a case where layers_pattern + # is one of the target nn.modules + ("foo.bar.1.baz", ["baz"], [1], ["baz"], False), + # here, layers_pattern is 'bar', but only keys that contain '.bar' are valid. + ("bar.1.baz", ["baz"], [1], ["bar"], False), + ("foo.bar.001.baz", ["baz"], [1], ["bar"], True), + ("foo.bar.1.spam.2.baz", ["baz"], [1], ["bar"], True), + ("foo.bar.2.spam.1.baz", ["baz"], [1], ["bar"], False), + # some realistic examples: module using nn.Sequential + # for the below test case, key should contain '.blocks' to be valid, because of how layers_pattern is matched + ("blocks.1.weight", ["weight"], [1], ["blocks"], False), + ("blocks.1.bias", ["weight"], [1], ["blocks"], False), + ("mlp.blocks.1.weight", ["weight"], [1], ["blocks"], True), + ("mlp.blocks.1.bias", ["weight"], [1], ["blocks"], False), +] + + +class PeftCustomKwargsTester(unittest.TestCase): + r""" + Test if the PeftModel is instantiated with correct behaviour for custom kwargs. This includes: + - test if regex matching works correctly + - test if adapters handle custom kwargs the right way e.g. IA3 for `feedforward_modules` + + """ + + transformers_class = AutoModel + + @parameterized.expand(TEST_CASES) + def test_regex_matching_valid(self, key, target_modules, layers_to_transform, layers_pattern, expected_result): + # We use a LoRA Config for testing, but the regex matching function is common for all BaseTuner subclasses. + # example model_id for config initialization. key is matched only against the target_modules given, so this can be any model + model_id = "peft-internal-testing/tiny-OPTForCausalLM-lora" + config = LoraConfig( + base_model_name_or_path=model_id, + target_modules=target_modules, + layers_pattern=layers_pattern, + layers_to_transform=layers_to_transform, + ) + actual_result = bool(check_target_module_exists(config, key)) + self.assertEqual(actual_result, expected_result) + + def test_module_matching_lora(self): + # peft models that have a module matching method to inspect the matching modules to allow + # users to easily debug their configuration. Here we only test a single case, not all possible combinations of + # configs that could exist. This is okay as the method calls `check_target_module_exists` internally, which + # has been extensively tested above. 
+ model_id = "hf-internal-testing/tiny-random-BloomForCausalLM" + model = self.transformers_class.from_pretrained(model_id) + # by default, this model matches query_key_value + config = LoraConfig() + peft_model = get_peft_model(model, config) + + output = inspect_matched_modules(peft_model) # inspects default adapter for peft_model + matched = output["matched"] + expected = [ + "h.0.self_attention.query_key_value", + "h.1.self_attention.query_key_value", + "h.2.self_attention.query_key_value", + "h.3.self_attention.query_key_value", + "h.4.self_attention.query_key_value", + ] + self.assertEqual(matched, expected) # module lists should match exactly + + # no overlap with matched modules + unmatched = output["unmatched"] + for key in expected: + self.assertFalse(key in unmatched) + + def test_feedforward_matching_ia3(self): + model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration" + model = self.transformers_class.from_pretrained(model_id) + # simple example for just one t5 block for testing + config_kwargs = { + "target_modules": ".*encoder.*block.0.*(SelfAttention|EncDecAttention|DenseReluDense).(k|q|v|wo|wi)$", + "feedforward_modules": ["wo", "wi"], + } + config = IA3Config(base_model_name_or_path=model_id, **config_kwargs) + peft_model = get_peft_model(model, config) + output = inspect_matched_modules(peft_model) # inspects default adapter for peft_model + matched = output["matched"] + expected = [ + "encoder.block.0.layer.0.SelfAttention.q", + "encoder.block.0.layer.0.SelfAttention.k", + "encoder.block.0.layer.0.SelfAttention.v", + "encoder.block.0.layer.1.DenseReluDense.wi", + "encoder.block.0.layer.1.DenseReluDense.wo", + ] + expected_feedforward = [ + "encoder.block.0.layer.1.DenseReluDense.wi", + "encoder.block.0.layer.1.DenseReluDense.wo", + ] + self.assertEqual(matched, expected) # not required since we do similar checks above, but just to be sure + module_dict = dict(model.named_modules()) + for key in matched: + module = module_dict[key] + if key in expected_feedforward: + self.assertTrue(module.is_feedforward) + else: # other IA3 modules should not be marked as feedforward + self.assertFalse(module.is_feedforward) From 9da72d25ed209f5c3956ad54ee8e46f5d347a25e Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 1 Nov 2023 12:41:23 +0100 Subject: [PATCH 06/65] Fix Slack bot not displaying error messages (#1068) * Update log_reports.py * Update log_reports.py * Update log_reports.py * change logic * fix --- scripts/log_reports.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/log_reports.py b/scripts/log_reports.py index ebd3037d44..ead94a0218 100644 --- a/scripts/log_reports.py +++ b/scripts/log_reports.py @@ -10,12 +10,16 @@ total_num_failed = 0 empty_file = False or len(list(Path().glob("*.log"))) == 0 + +total_empty_files = [] + for log in Path().glob("*.log"): section_num_failed = 0 + i = 0 with open(log, "r") as f: - nb_lines = sum(1 for _ in f) - for i, line in f: + for line in f: line = json.loads(line) + i += 1 if line.get("nodeid", "") != "": test = line["nodeid"] if line.get("duration", None) is not None: @@ -26,16 +30,16 @@ total_num_failed += 1 else: passed.append([test, duration, log.name.split('_')[0]]) - if nb_lines == 0: - empty_file = True + empty_file = i == 0 group_info.append([str(log), section_num_failed, failed]) + total_empty_files.append(empty_file) os.remove(log) failed = [] no_error_payload = { "type": "section", "text": { "type": 
"plain_text", - "text": "🌞 There were no failures!" if not empty_file else "Something went wrong - please check GH action results.", + "text": "🌞 There were no failures!" if not any(total_empty_files) else "Something went wrong there is at least one empty file - please check GH action results.", "emoji": True } } @@ -51,7 +55,7 @@ }, ] if total_num_failed > 0: - for name, num_failed, failed_tests in group_info: + for i, (name, num_failed, failed_tests) in enumerate(group_info): if num_failed > 0: if num_failed == 1: message += f"*{name}: {num_failed} failed test*\n" @@ -62,10 +66,12 @@ failed_table.append(test[0].split("::")) failed_table = tabulate(failed_table, headers=["Test Location", "Test Case", "Test Name"], showindex="always", tablefmt="grid", maxcolwidths=[12, 12, 12]) message += '\n```\n' +failed_table + '\n```' + + if total_empty_files[i]: + message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n" print(f'### {message}') else: payload.append(no_error_payload) - if os.environ.get("TEST_TYPE", "") != "": from slack_sdk import WebClient From 23cfbf22eb397ab3ff3126748b27a820ae88c1b1 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:23:17 +0100 Subject: [PATCH 07/65] Fix slow tests not running (#1071) * Update nightly.yml * Update nightly.yml --- .github/workflows/nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index d0c537a947..86a6e271c1 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -21,7 +21,7 @@ jobs: TEST_TYPE: "single_gpu" container: image: huggingface/peft-gpu:latest - options: --gpus all --shm-size "16gb" + options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true defaults: run: working-directory: peft/ @@ -63,7 +63,7 @@ jobs: TEST_TYPE: "multi_gpu" container: image: huggingface/peft-gpu:latest - options: --gpus all --shm-size "16gb" + options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true defaults: run: working-directory: peft/ From 02f0a4ca5992bf516b9807c5870811ef8ad199fa Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 2 Nov 2023 15:07:03 +0100 Subject: [PATCH 08/65] Release version 0.6.0 (#1072) --- setup.py | 2 +- src/peft/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index af00c2870c..a5b8009b44 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name="peft", - version="0.6.0.dev0", + version="0.6.0", description="Parameter-Efficient Fine-Tuning (PEFT)", license_files=["LICENSE"], long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 53ba2bd568..6d23092e25 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.6.0.dev0" +__version__ = "0.6.0" from .auto import ( AutoPeftModel, From d47d23aa0e41b8a65e85bc225b786836e3ce5ab1 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 3 Nov 2023 11:25:04 +0100 Subject: [PATCH 09/65] After release: Bump version to 0.7.0.dev0 (#1074) --- setup.py | 2 +- src/peft/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a5b8009b44..bf30c3a8d0 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name="peft", - version="0.6.0", + version="0.7.0.dev0", description="Parameter-Efficient Fine-Tuning (PEFT)", license_files=["LICENSE"], long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 6d23092e25..601f95b32a 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.6.0" +__version__ = "0.7.0.dev0" from .auto import ( AutoPeftModel, From cfe35a7878b44e017836f0b0c0c3b3e9e0cb738b Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 3 Nov 2023 15:52:51 +0100 Subject: [PATCH 10/65] FIX: Skip adaption prompt tests with new transformers versions (#1077) Adaption prompt is failing with transformers v4.35.0. This PR skips the adaption prompt tests so that CI is green again. The PR also adds an error when users try to use adaption prompt with that version, instructing them to use an older transformers version instead. This should be removed as soon as the issue is fixed in PEFT/transformers. --- src/peft/tuners/adaption_prompt/config.py | 17 +++++++++++++++++ tests/test_adaption_prompt.py | 8 +++++++- tests/test_config.py | 6 ++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/peft/tuners/adaption_prompt/config.py b/src/peft/tuners/adaption_prompt/config.py index 37d206248a..d2d0f07f78 100644 --- a/src/peft/tuners/adaption_prompt/config.py +++ b/src/peft/tuners/adaption_prompt/config.py @@ -13,15 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib from collections import namedtuple from dataclasses import dataclass, field +from packaging.version import parse + from peft.config import PeftConfig from peft.utils import PeftType from .utils import llama_compute_query_states +MAX_TRANSFORMERS_VERSION = "4.35.0" + + +def is_transformers_version_ge(version: str) -> bool: + return parse(importlib.metadata.version("transformers")) >= parse(version) + + @dataclass class AdaptionPromptConfig(PeftConfig): """Stores the configuration of an [`AdaptionPromptModel`].""" @@ -33,6 +43,13 @@ class AdaptionPromptConfig(PeftConfig): adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"}) def __post_init__(self): + # TODO: Remove this check and function once PEFT works again with newest transformers version. + # Also remove the skip in test_adaption_prompt.py and uncomment the adaption prompt config in test_config.py. + if is_transformers_version_ge(MAX_TRANSFORMERS_VERSION): + raise ValueError( + f"Adaption prompt is not compatible with transformers >= {MAX_TRANSFORMERS_VERSION}, " + "please use an older version of transformers until this is fixed." 
+ ) self.peft_type = PeftType.ADAPTION_PROMPT @property diff --git a/tests/test_adaption_prompt.py b/tests/test_adaption_prompt.py index 1f666e51d2..363591bda3 100644 --- a/tests/test_adaption_prompt.py +++ b/tests/test_adaption_prompt.py @@ -53,7 +53,13 @@ class AdaptionPromptTester(TestCase, PeftCommonTester): """ def setUp(self): - """Check that llama is available in transformers package before running each test.""" + # TODO: remove the imports and version check once PEFT works again with transformers + from peft.tuners.adaption_prompt.config import MAX_TRANSFORMERS_VERSION, is_transformers_version_ge + + if is_transformers_version_ge(MAX_TRANSFORMERS_VERSION): + self.skipTest("Adaption prompt is currently failing on transformers 4.35.0, skipping test.") + + # Check that llama is available in transformers package before running each test. if not is_llama_available(): self.skipTest("Llama not available in transformers. Skipping test.") diff --git a/tests/test_config.py b/tests/test_config.py index 7b038f275e..57e83965c9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,7 +24,8 @@ from peft import ( AdaLoraConfig, - AdaptionPromptConfig, + # TODO: uncomment once PEFT works again with transformers + # AdaptionPromptConfig, IA3Config, LoHaConfig, LoraConfig, @@ -40,7 +41,8 @@ PEFT_MODELS_TO_TEST = [("lewtun/tiny-random-OPTForCausalLM-delta", "v1")] ALL_CONFIG_CLASSES = ( - AdaptionPromptConfig, + # TODO: uncomment once PEFT works again with transformers + # AdaptionPromptConfig, AdaLoraConfig, IA3Config, LoHaConfig, From 276c91b143cf593a3fa3b0cfd9def9d42066cd08 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 6 Nov 2023 14:04:19 +0100 Subject: [PATCH 11/65] FIX: fix adaptation prompt CI and compatibility with latest transformers (4.35.0) (#1084) * fix adaptation prompt CI * undo some other changes --- src/peft/tuners/adaption_prompt/config.py | 17 ----------------- src/peft/tuners/adaption_prompt/utils.py | 18 +++++++++++++----- tests/test_adaption_prompt.py | 6 ------ tests/test_config.py | 4 ++-- 4 files changed, 15 insertions(+), 30 deletions(-) diff --git a/src/peft/tuners/adaption_prompt/config.py b/src/peft/tuners/adaption_prompt/config.py index d2d0f07f78..37d206248a 100644 --- a/src/peft/tuners/adaption_prompt/config.py +++ b/src/peft/tuners/adaption_prompt/config.py @@ -13,25 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import importlib from collections import namedtuple from dataclasses import dataclass, field -from packaging.version import parse - from peft.config import PeftConfig from peft.utils import PeftType from .utils import llama_compute_query_states -MAX_TRANSFORMERS_VERSION = "4.35.0" - - -def is_transformers_version_ge(version: str) -> bool: - return parse(importlib.metadata.version("transformers")) >= parse(version) - - @dataclass class AdaptionPromptConfig(PeftConfig): """Stores the configuration of an [`AdaptionPromptModel`].""" @@ -43,13 +33,6 @@ class AdaptionPromptConfig(PeftConfig): adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"}) def __post_init__(self): - # TODO: Remove this check and function once PEFT works again with newest transformers version. - # Also remove the skip in test_adaption_prompt.py and uncomment the adaption prompt config in test_config.py. 
- if is_transformers_version_ge(MAX_TRANSFORMERS_VERSION): - raise ValueError( - f"Adaption prompt is not compatible with transformers >= {MAX_TRANSFORMERS_VERSION}, " - "please use an older version of transformers until this is fixed." - ) self.peft_type = PeftType.ADAPTION_PROMPT @property diff --git a/src/peft/tuners/adaption_prompt/utils.py b/src/peft/tuners/adaption_prompt/utils.py index da86db780d..921982fbb7 100644 --- a/src/peft/tuners/adaption_prompt/utils.py +++ b/src/peft/tuners/adaption_prompt/utils.py @@ -39,12 +39,20 @@ def llama_apply_rotary_pos_emb(q, cos, sin, position_ids): This function was adapted from: https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L133 - It was modified to remove unnecessary processing of key states. + It was modified to remove unnecessary processing of key states. The method is compatible with transformers <= + 4.34.2 and also with the latest version (>=4.35). """ - gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1] - gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3]) - cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) - sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + # In previous transformers version cos/sin cached had a shape of 4D + if len(cos.shape) == 4: + gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1] + gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3]) + cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + # In the new version, it is 2D so we fall back to the new implementation + # https://github.com/huggingface/transformers/blame/eef7ea98c31a333bacdc7ae7a2372bde772be8e4/src/transformers/models/llama/modeling_llama.py#L222-L226 + else: + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) q_embed = (q * cos) + (llama_rotate_half(q) * sin) return q_embed diff --git a/tests/test_adaption_prompt.py b/tests/test_adaption_prompt.py index 363591bda3..d3f2e74140 100644 --- a/tests/test_adaption_prompt.py +++ b/tests/test_adaption_prompt.py @@ -53,12 +53,6 @@ class AdaptionPromptTester(TestCase, PeftCommonTester): """ def setUp(self): - # TODO: remove the imports and version check once PEFT works again with transformers - from peft.tuners.adaption_prompt.config import MAX_TRANSFORMERS_VERSION, is_transformers_version_ge - - if is_transformers_version_ge(MAX_TRANSFORMERS_VERSION): - self.skipTest("Adaption prompt is currently failing on transformers 4.35.0, skipping test.") - # Check that llama is available in transformers package before running each test. if not is_llama_available(): self.skipTest("Llama not available in transformers. 
Skipping test.") diff --git a/tests/test_config.py b/tests/test_config.py index 57e83965c9..62497ae9e5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -25,7 +25,7 @@ from peft import ( AdaLoraConfig, # TODO: uncomment once PEFT works again with transformers - # AdaptionPromptConfig, + AdaptionPromptConfig, IA3Config, LoHaConfig, LoraConfig, @@ -42,7 +42,7 @@ ALL_CONFIG_CLASSES = ( # TODO: uncomment once PEFT works again with transformers - # AdaptionPromptConfig, + AdaptionPromptConfig, AdaLoraConfig, IA3Config, LoHaConfig, From 45343a4ccc7e94b47aeebbcc1bbc7c8d01556065 Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Tue, 7 Nov 2023 02:44:27 -0800 Subject: [PATCH 12/65] =?UTF-8?q?Improve=20documentation=20for=20IA=C2=B3?= =?UTF-8?q?=20(#984)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Improve ia3 documentation - Raise value error for incorrect feedforward_module list - Added tests --------- Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Co-authored-by: Benjamin Bossan --- docs/source/conceptual_guides/ia3.mdx | 26 ++++++++++++++++++------- src/peft/tuners/ia3/config.py | 9 ++++++++- tests/test_config.py | 28 +++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/docs/source/conceptual_guides/ia3.mdx b/docs/source/conceptual_guides/ia3.mdx index 482bba01d7..cb04d4818e 100644 --- a/docs/source/conceptual_guides/ia3.mdx +++ b/docs/source/conceptual_guides/ia3.mdx @@ -28,10 +28,13 @@ Being similar to LoRA, IA3 carries many of the same advantages: * Performance of models fine-tuned using IA3 is comparable to the performance of fully fine-tuned models. * IA3 does not add any inference latency because adapter weights can be merged with the base model. -In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable -parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers -of a Transformer model. Given the target layers for injecting IA3 parameters, the number of trainable parameters -can be determined based on the size of the weight matrices. +In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable +parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers +of a Transformer model. To be specific, for transformer models, IA3 weights are added to the outputs of key and value layers, and to the input of the second feedforward layer +in each transformer block. + +Given the target layers for injecting IA3 parameters, the number of trainable parameters +can be determined based on the size of the weight matrices. ## Common IA3 parameters in PEFT @@ -43,10 +46,19 @@ As with other methods supported by PEFT, to fine-tune a model using IA3, you nee 3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`. 4. Train the `PeftModel` as you normally would train the base model. -`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters: +`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters: - `target_modules`: The modules (for example, attention blocks) to apply the IA3 vectors. -- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. 
While learned vectors are multiplied with -the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers. +- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. While learned vectors are multiplied with +the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers. Note that `feedforward_modules` must be a subset of `target_modules`. - `modules_to_save`: List of modules apart from IA3 layers to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. +## Example Usage + +For the task of sequence classification, one can initialize the IA3 config for a Llama model as follows: + +```py +peft_config = IA3Config( + task_type=TaskType.SEQ_CLS, target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"] +) +``` \ No newline at end of file diff --git a/src/peft/tuners/ia3/config.py b/src/peft/tuners/ia3/config.py index f36ef77348..24da508618 100644 --- a/src/peft/tuners/ia3/config.py +++ b/src/peft/tuners/ia3/config.py @@ -29,7 +29,9 @@ class IA3Config(PeftConfig): target_modules (`Union[List[str],str]`): The names of the modules to apply (IA)^3 to. feedforward_modules (`Union[List[str],str]`): - The names of the modules to be treated as feedforward modules, as in the original paper. + The names of the modules to be treated as feedforward modules, as in the original paper. These modules will + have (IA)^3 vectors multiplied to the input, instead of the output. feedforward_modules must be a name or a + subset of names present in target_modules. fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. @@ -78,3 +80,8 @@ def __post_init__(self): self.feedforward_modules = ( set(self.feedforward_modules) if isinstance(self.feedforward_modules, list) else self.feedforward_modules ) + + # check if feedforward_modules is a subset of target_modules. run the check only if both are sets + if isinstance(self.feedforward_modules, set) and isinstance(self.target_modules, set): + if not self.feedforward_modules.issubset(self.target_modules): + raise ValueError("`feedforward_modules` should be a subset of `target_modules`") diff --git a/tests/test_config.py b/tests/test_config.py index 62497ae9e5..34f04232a9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -223,3 +223,31 @@ def test_regex_with_layer_indexing_lora(self): # should run without errors LoraConfig(**valid_config) + + def test_ia3_is_feedforward_subset_invalid_config(self): + # This test checks that the IA3 config raises a value error if the feedforward_modules argument + # is not a subset of the target_modules argument + + # an example invalid config + invalid_config = {"target_modules": ["k", "v"], "feedforward_modules": ["q"]} + + with self.assertRaisesRegex( + ValueError, expected_regex="^`feedforward_modules` should be a subset of `target_modules`$" + ): + IA3Config(**invalid_config) + + def test_ia3_is_feedforward_subset_valid_config(self): + # This test checks that the IA3 config is created without errors with valid arguments. + # feedforward_modules should be a subset of target_modules if both are lists + + # an example valid config with regex expressions. 
+ valid_config_regex_exp = { + "target_modules": ".*.(SelfAttention|EncDecAttention|DenseReluDense).*(q|v|wo)$", + "feedforward_modules": ".*.DenseReluDense.wo$", + } + # an example valid config with module lists. + valid_config_list = {"target_modules": ["k", "v", "wo"], "feedforward_modules": ["wo"]} + + # should run without errors + IA3Config(**valid_config_regex_exp) + IA3Config(**valid_config_list) From 46e03602ed3f273d76b024572f2f1346661be4a7 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 7 Nov 2023 12:20:15 +0100 Subject: [PATCH 13/65] [`Docker`] Update Dockerfile to force-use transformers main (#1085) * Update Dockerfile * Update Dockerfile * Update Dockerfile --- docker/peft-gpu/Dockerfile | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docker/peft-gpu/Dockerfile b/docker/peft-gpu/Dockerfile index 9b538d218d..52af326eed 100644 --- a/docker/peft-gpu/Dockerfile +++ b/docker/peft-gpu/Dockerfile @@ -29,15 +29,6 @@ ENV PATH /opt/conda/envs/peft/bin:$PATH # Activate our bash shell RUN chsh -s /bin/bash SHELL ["/bin/bash", "-c"] -# Activate the conda env and install transformers + accelerate from source -RUN source activate peft && \ - python3 -m pip install --no-cache-dir \ - librosa \ - "soundfile>=0.12.1" \ - scipy \ - git+https://github.com/huggingface/transformers \ - git+https://github.com/huggingface/accelerate \ - peft[test]@git+https://github.com/huggingface/peft # Stage 2 FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image @@ -49,6 +40,18 @@ SHELL ["/bin/bash", "-c"] RUN source activate peft && \ python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq +# Activate the conda env and install transformers + accelerate from source +RUN source activate peft && \ + python3 -m pip install -U --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + git+https://github.com/huggingface/transformers \ + git+https://github.com/huggingface/accelerate \ + peft[test]@git+https://github.com/huggingface/peft + +RUN pip freeze | grep transformers + # Install apt libs RUN apt-get update && \ apt-get install -y curl git wget && \ From 4c48970cb008b0daebce257d6de81ef134efc2b2 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 7 Nov 2023 14:23:38 +0100 Subject: [PATCH 14/65] Update the release checklist (#1075) As discussed, we wanted to make small amendments to the release process, so that we have a 0.N.0 commit on main. I also adjusted the wording here and there. --- setup.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index bf30c3a8d0..6f807c4c22 100644 --- a/setup.py +++ b/setup.py @@ -63,19 +63,23 @@ ) # Release checklist -# 1. Change the version in __init__.py and setup.py. -# 2. Commit these changes with the message: "Release: VERSION" -# 3. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' " -# Push the tag to git: git push --tags origin main -# 4. Run the following commands in the top-level directory: +# 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.0.dev0" to "0.6.0" +# 2. Check if there are any deprecations that need to be addressed for this release by seaching for "# TODO" in the code +# 3. Commit these changes with the message: "Release: VERSION", create a PR and merge it. +# 4. 
Add a tag in git to mark the release: "git tag -a VERSION -m 'Adds tag VERSION for pypi' " +# Push the tag to git: +# git push --tags origin main +# It is necessary to work on the original repository, not on a fork. +# 5. Run the following commands in the top-level directory: # python setup.py bdist_wheel # python setup.py sdist -# 5. Upload the package to the pypi test server first: +# Ensure that you are on the clean and up-to-date main branch (git status --untracked-files=no should not list any +# files and show the main branch) +# 6. Upload the package to the pypi test server first: # twine upload dist/* -r pypitest -# twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ -# 6. Check that you can install it in a virtualenv by running: +# 7. Check that you can install it in a virtualenv by running: # pip install -i https://testpypi.python.org/pypi peft -# 7. Upload the final version to actual pypi: +# 8. Upload the final version to actual pypi: # twine upload dist/* -r pypi -# 8. Add release notes to the tag in github once everything is looking hunky-dory. -# 9. Update the version in __init__.py, setup.py to the new version "-dev" and push to master +# 9. Add release notes to the tag on https://github.com/huggingface/peft/releases once everything is looking hunky-dory. +# 10. Update the version in __init__.py, setup.py to the bumped minor version + ".dev0" (e.g. from "0.6.0" to "0.7.0.dev0") From ed4ce9fc94662a4825ef869c98e6de8c16d8ef28 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 7 Nov 2023 17:12:23 +0100 Subject: [PATCH 15/65] fix-gptq-training (#1086) * fix-gptq-training * style * review --- src/peft/import_utils.py | 16 ++++++++++++++-- src/peft/utils/other.py | 12 ++++++++++-- tests/test_gpu_examples.py | 3 ++- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/peft/import_utils.py b/src/peft/import_utils.py index 991e58e88b..a33ac333d7 100644 --- a/src/peft/import_utils.py +++ b/src/peft/import_utils.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import importlib +import importlib.metadata as importlib_metadata + +import packaging.version def is_bnb_available() -> bool: @@ -28,8 +31,17 @@ def is_bnb_4bit_available() -> bool: return hasattr(bnb.nn, "Linear4bit") -def is_auto_gptq_available() -> bool: - return importlib.util.find_spec("auto_gptq") is not None +def is_auto_gptq_available(): + if importlib.util.find_spec("auto_gptq") is not None: + AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0") + version_autogptq = packaging.version.parse(importlib_metadata.version("auto_gptq")) + if AUTOGPTQ_MINIMUM_VERSION <= version_autogptq: + return True + else: + raise ImportError( + f"Found an incompatible version of auto-gptq. 
Found version {version_autogptq}, " + "but only versions above {AUTOGPTQ_MINIMUM_VERSION} are supported" + ) def is_optimum_available() -> bool: diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index e33c52e21e..05b1ee69bd 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -419,13 +419,21 @@ def get_auto_gptq_quant_linear(gptq_quantization_config): desc_act = gptq_quantization_config.desc_act group_size = gptq_quantization_config.group_size bits = gptq_quantization_config.bits - disable_exllama = gptq_quantization_config.disable_exllama + if hasattr(gptq_quantization_config, "use_exllama"): + use_exllama = gptq_quantization_config.use_exllama + else: + use_exllama = not gptq_quantization_config.disable_exllama + if hasattr(gptq_quantization_config, "exllama_config"): + exllama_version = gptq_quantization_config.exllama_config["version"] + else: + exllama_version = 1 AutoGPTQQuantLinear = dynamically_import_QuantLinear( use_triton=False, desc_act=desc_act, group_size=group_size, bits=bits, - disable_exllama=disable_exllama, + disable_exllama=not (use_exllama and exllama_version == 1), + disable_exllamav2=not (use_exllama and exllama_version == 2), ) return AutoGPTQQuantLinear return None diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index fbabb8bb6c..9c4fad2132 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -658,7 +658,8 @@ def setUp(self): from transformers import GPTQConfig self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit" - self.quantization_config = GPTQConfig(bits=4, disable_exllama=True) + # TODO : check if it works for Exllamav2 kernels + self.quantization_config = GPTQConfig(bits=4, use_exllama=False) self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) def tearDown(self): From 493ae58beb15f1ff533eaee253ee477c36dd6c03 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 8 Nov 2023 14:47:55 +0530 Subject: [PATCH 16/65] fix the failing CI tests (#1094) --- tests/test_common_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_common_gpu.py b/tests/test_common_gpu.py index f329600f94..e5a9f92805 100644 --- a/tests/test_common_gpu.py +++ b/tests/test_common_gpu.py @@ -158,8 +158,8 @@ def test_ia3_bnb_8bit_quantization(self): flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") opt_ia3_config = IA3Config( - target_modules=["q_proj", "v_proj"], - feedforward_modules=["down_proj"], + target_modules=["q_proj", "v_proj", "fc2"], + feedforward_modules=["fc2"], task_type="CAUSAL_LM", ) From d9094cebea954760259295875b656846f9364d39 Mon Sep 17 00:00:00 2001 From: KCFindstr Date: Wed, 8 Nov 2023 03:12:24 -0800 Subject: [PATCH 17/65] FIX: broken f-string in import_utils (#1091) --- src/peft/import_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/import_utils.py b/src/peft/import_utils.py index a33ac333d7..a62525c1c4 100644 --- a/src/peft/import_utils.py +++ b/src/peft/import_utils.py @@ -40,7 +40,7 @@ def is_auto_gptq_available(): else: raise ImportError( f"Found an incompatible version of auto-gptq. 
Found version {version_autogptq}, "
- "but only versions above {AUTOGPTQ_MINIMUM_VERSION} are supported"
+ f"but only versions above {AUTOGPTQ_MINIMUM_VERSION} are supported"
)

From face67dfeb552cd0d8c2b2c9683f125639d4e793 Mon Sep 17 00:00:00 2001
From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com>
Date: Wed, 8 Nov 2023 23:11:57 -0800
Subject: [PATCH 18/65] Fix IA3 config for Falcon models (#1007)

* fixed feedforward for falcon

* fixed target_modules for falcon
---
 src/peft/utils/other.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py
index 05b1ee69bd..1679a3fec6 100644
--- a/src/peft/utils/other.py
+++ b/src/peft/utils/other.py
@@ -486,9 +486,9 @@ def get_auto_gptq_quant_linear(gptq_quantization_config):
 "bert": ["key", "value", "output.dense"],
 "deberta-v2": ["key_proj", "value_proj", "output.dense"],
 "deberta": ["in_proj", "output.dense"],
- "RefinedWebModel": ["query_key_value"],
- "RefinedWeb": ["query_key_value"],
- "falcon": ["query_key_value"],
+ "RefinedWebModel": ["query_key_value", "dense_4h_to_h"],
+ "RefinedWeb": ["query_key_value", "dense_4h_to_h"],
+ "falcon": ["query_key_value", "dense_4h_to_h"],
 }

 TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING = {
 "bert": ["output.dense"],
 "deberta-v2": ["output.dense"],
 "deberta": ["output.dense"],
- "RefinedWeb": ["query_key_value"],
- "RefinedWebModel": ["query_key_value"],
- "falcon": ["query_key_value"],
+ "RefinedWeb": ["dense_4h_to_h"],
+ "RefinedWebModel": ["dense_4h_to_h"],
+ "falcon": ["dense_4h_to_h"],
 }

 COMMON_LAYERS_PATTERN = ["layers", "h", "block", "blocks", "layer"]

From c5d94855cd101fe7ae5248953d137d941582b643 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan
Date: Thu, 9 Nov 2023 13:50:44 +0100
Subject: [PATCH 19/65] FIX Failing nightly CI tests due to IA3 config (#1100)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same idea as in PR as #1094, but for yet more ill-configured IA³ configs. The
tests are now failing because we do stricter checks on incorrect IA³ configs.
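For reference, here is a minimal sketch (not part of the patch) of what the stricter check requires: `feedforward_modules` must be a subset of `target_modules`. The module names follow the OPT-style config used in the updated tests; the base model id is only an illustrative choice.

```py
from transformers import AutoModelForCausalLM
from peft import IA3Config, get_peft_model

# Valid: the feedforward module ("fc2") is also listed in target_modules
ia3_config = IA3Config(
    target_modules=["q_proj", "v_proj", "fc2"],
    feedforward_modules=["fc2"],
    task_type="CAUSAL_LM",
)

# Invalid: "fc2" is missing from target_modules, so IA3Config raises
# ValueError: `feedforward_modules` should be a subset of `target_modules`
# IA3Config(target_modules=["q_proj", "v_proj"], feedforward_modules=["fc2"])

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")  # example base model
peft_model = get_peft_model(base_model, ia3_config)
peft_model.print_trainable_parameters()
```

With the stricter validation, a `feedforward_modules` entry that is missing from `target_modules` now fails fast at config creation instead of surfacing later in the nightly GPU tests.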
--- tests/test_common_gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_common_gpu.py b/tests/test_common_gpu.py index e5a9f92805..1228eb665f 100644 --- a/tests/test_common_gpu.py +++ b/tests/test_common_gpu.py @@ -163,7 +163,7 @@ def test_ia3_bnb_8bit_quantization(self): task_type="CAUSAL_LM", ) - config = IA3Config(target_modules=["q_proj", "v_proj"], feedforward_modules=["down_proj"]) + config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"]) flan_8bit = get_peft_model(flan_8bit, flan_ia3_config) self.assertTrue( @@ -276,12 +276,12 @@ def test_ia3_bnb_4bit_quantization(self): flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") opt_ia3_config = IA3Config( - target_modules=["q_proj", "v_proj"], - feedforward_modules=["down_proj"], + target_modules=["q_proj", "v_proj", "fc2"], + feedforward_modules=["fc2"], task_type="CAUSAL_LM", ) - config = IA3Config(target_modules=["q_proj", "v_proj"], feedforward_modules=["down_proj"]) + config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"]) flan_4bit = get_peft_model(flan_4bit, flan_ia3_config) self.assertTrue( From b5641cc7443a35cd36addb2c43bd5d51b6e75485 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 9 Nov 2023 14:50:35 +0100 Subject: [PATCH 20/65] [`core`] Fix safetensors serialization for shared tensors (#1101) * fix st serialization * add test * add CI test * add comment --- src/peft/peft_model.py | 26 ++++++++++++++++++++++++++ tests/test_common_gpu.py | 16 ++++++++++++++++ tests/test_encoder_decoder_models.py | 22 +++++++++++++++++++++- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 64e70f2ba7..fb4beef27d 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -15,6 +15,7 @@ from __future__ import annotations +import collections import inspect import os import warnings @@ -31,6 +32,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import PreTrainedModel from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput +from transformers.pytorch_utils import id_tensor_storage from transformers.utils import PushToHubMixin from . import __version__ @@ -168,6 +170,8 @@ def save_pretrained( save_directory (`str`): Directory where the adapter model and configuration files will be saved (will be created if it does not exist). + safe_serialization (`bool`, *optional*): + Whether to save the adapter files in safetensors format. kwargs (additional keyword arguments, *optional*): Additional keyword arguments passed along to the `push_to_hub` method. """ @@ -199,6 +203,28 @@ def save_pretrained( os.makedirs(output_dir, exist_ok=True) if safe_serialization: + # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 + # Safetensors does not allow tensor aliasing. + # We're going to remove aliases before saving + ptrs = collections.defaultdict(list) + for name, tensor in output_state_dict.items(): + # Sometimes in the state_dict we have non-tensor objects. + # e.g. 
in bitsandbytes we have some `str` objects in the state_dict + if isinstance(tensor, torch.Tensor): + ptrs[id_tensor_storage(tensor)].append(name) + else: + # In the non-tensor case, fall back to the pointer of the object itself + ptrs[id(tensor)].append(name) + + # These are all the pointers of shared tensors. + shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} + + for _, names in shared_ptrs.items(): + # Here we just clone the shared tensors to avoid tensor aliasing which is + # not supported in safetensors. + for shared_tensor_name in names[1:]: + output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone() + safe_save_file( output_state_dict, os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), diff --git a/tests/test_common_gpu.py b/tests/test_common_gpu.py index 1228eb665f..ae13ef1988 100644 --- a/tests/test_common_gpu.py +++ b/tests/test_common_gpu.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import gc +import tempfile import unittest import pytest @@ -22,6 +23,7 @@ AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, + AutoModelForTokenClassification, AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM, @@ -33,6 +35,7 @@ IA3Config, LoraConfig, PeftModel, + TaskType, get_peft_model, prepare_model_for_kbit_training, ) @@ -631,3 +634,16 @@ def test_4bit_merge_and_disable_lora(self): self.assertTrue(isinstance(model, PeftModel)) self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit)) self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit)) + + @require_torch_gpu + @pytest.mark.single_gpu_tests + def test_serialization_shared_tensors(self): + model_checkpoint = "roberta-base" + peft_config = LoraConfig( + task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all" + ) + model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=11).to("cuda") + model = get_peft_model(model, peft_config) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, safe_serialization=True) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index cf200399bf..9acc1c63a2 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -12,11 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import tempfile import unittest import torch from parameterized import parameterized -from transformers import AutoModelForSeq2SeqLM +from transformers import AutoModelForSeq2SeqLM, AutoModelForTokenClassification + +from peft import LoraConfig, TaskType, get_peft_model from .testing_common import PeftCommonTester, PeftTestConfigManager @@ -172,3 +175,20 @@ def test_training_prompt_learning_tasks(self, test_name, model_id, config_cls, c ) def test_disable_adapter(self, test_name, model_id, config_cls, config_kwargs): self._test_disable_adapter(model_id, config_cls, config_kwargs) + + +class PeftEncoderDecoderCustomModelTester(unittest.TestCase): + """ + A custom class to write any custom test related with Enc-Dec models + """ + + def test_save_shared_tensors(self): + model_id = "hf-internal-testing/tiny-random-RobertaModel" + peft_config = LoraConfig( + task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all" + ) + model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=11) + model = get_peft_model(model, peft_config) + with tempfile.TemporaryDirectory() as tmp_dir: + # This should work fine + model.save_pretrained(tmp_dir, safe_serialization=True) From 669dd4edeb60a7fdd15f6ce1743cf4775a53e42a Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 9 Nov 2023 15:03:15 +0100 Subject: [PATCH 21/65] Change to 0.6.1.dev0 (#1102) * change to 0.6.1.dev0 * oops --- setup.py | 2 +- src/peft/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 6f807c4c22..0cccd87b48 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name="peft", - version="0.7.0.dev0", + version="0.6.1.dev0", description="Parameter-Efficient Fine-Tuning (PEFT)", license_files=["LICENSE"], long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 601f95b32a..16d74b56e7 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.7.0.dev0" +__version__ = "0.6.1.dev0" from .auto import ( AutoPeftModel, From 2efd02769b1a37dc3abd01ac1e7c5b38ddf72e7f Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 9 Nov 2023 15:16:33 +0100 Subject: [PATCH 22/65] Release: 0.6.1 (#1103) --- setup.py | 2 +- src/peft/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0cccd87b48..a3bcfa6fc4 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name="peft", - version="0.6.1.dev0", + version="0.6.1", description="Parameter-Efficient Fine-Tuning (PEFT)", license_files=["LICENSE"], long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 16d74b56e7..2e6cdff0e5 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.6.1.dev0" +__version__ = "0.6.1" from .auto import ( AutoPeftModel, From 9d8287f3e32be7b22bd77fe23b96054cff291642 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 9 Nov 2023 15:44:28 +0100 Subject: [PATCH 23/65] set dev version (#1104) --- setup.py | 2 +- src/peft/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a3bcfa6fc4..1418afa46a 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name="peft", - version="0.6.1", + version="0.6.2.dev0", description="Parameter-Efficient Fine-Tuning (PEFT)", license_files=["LICENSE"], long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 2e6cdff0e5..4a5ecb7de1 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.6.1" +__version__ = "0.6.2.dev0" from .auto import ( AutoPeftModel, From 5e7e5ad83645a75934e6555185384ad535d6bd0b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 10 Nov 2023 06:35:18 -0500 Subject: [PATCH 24/65] Avoid over-eager auto-gptq import (#1109) --- src/peft/utils/other.py | 43 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 1679a3fec6..c879a45a0f 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -412,30 +412,29 @@ def get_auto_gptq_quant_linear(gptq_quantization_config): """ Get the right AutoGPTQQuantLinear class based on the quantization config file """ - if is_auto_gptq_available(): + if gptq_quantization_config is not None and is_auto_gptq_available(): from auto_gptq.utils.import_utils import dynamically_import_QuantLinear - if gptq_quantization_config is not None: - desc_act = gptq_quantization_config.desc_act - group_size = gptq_quantization_config.group_size - bits = gptq_quantization_config.bits - if hasattr(gptq_quantization_config, "use_exllama"): - use_exllama = gptq_quantization_config.use_exllama - else: - use_exllama = not gptq_quantization_config.disable_exllama - if hasattr(gptq_quantization_config, "exllama_config"): - exllama_version = gptq_quantization_config.exllama_config["version"] - else: - exllama_version = 1 - AutoGPTQQuantLinear = dynamically_import_QuantLinear( - use_triton=False, - desc_act=desc_act, - group_size=group_size, - bits=bits, - disable_exllama=not (use_exllama and exllama_version == 1), - disable_exllamav2=not (use_exllama and exllama_version == 2), - ) - return AutoGPTQQuantLinear + desc_act = gptq_quantization_config.desc_act + group_size = gptq_quantization_config.group_size + bits = gptq_quantization_config.bits + if hasattr(gptq_quantization_config, "use_exllama"): + use_exllama = gptq_quantization_config.use_exllama + else: + use_exllama = not gptq_quantization_config.disable_exllama + if hasattr(gptq_quantization_config, "exllama_config"): + exllama_version = gptq_quantization_config.exllama_config["version"] + else: + exllama_version = 1 + AutoGPTQQuantLinear = dynamically_import_QuantLinear( + use_triton=False, + desc_act=desc_act, + group_size=group_size, + bits=bits, + disable_exllama=not (use_exllama and exllama_version == 1), + disable_exllamav2=not (use_exllama and exllama_version == 2), + ) + return AutoGPTQQuantLinear return None From 3af469eeea8235144e18f59dfb7df31711ebe014 Mon Sep 17 
00:00:00 2001 From: Benjamin Bossan Date: Fri, 10 Nov 2023 13:33:56 +0100 Subject: [PATCH 25/65] Refactor adapter deletion (#1105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Description The job of deleting an adapter is now transferred to the adapter layer, instead of the adapter model. This makes it easier for users or other libraries who don't use the adapter model to delete adapters. Implementation The code should now be more generic, relying less on hard-coded attributes. As a precaution, I also changed the type of adapter_layer_names from list to tuple, as it should not be mutated. When deleting the active adapter, the logic for choosing the new active adapter has been changed slightly to ensure consistency across layers. In practice, this should rarely make a difference. An error is now raised if the last remaining adapter is deleted. Test coverage has been increased: - Deleting adapters is now also tested for custom models. - It is also tested for LoHa, LoKr, not only LoRA. - I added a test for deleting the non-active adapter. Not implemented I did not add adapter deletion to IAΒ³, since it is included in #980. LMK if it should be added here instead. --- src/peft/tuners/adalora/layer.py | 3 +- src/peft/tuners/ia3/layer.py | 6 +- src/peft/tuners/ia3/model.py | 2 +- src/peft/tuners/loha/layer.py | 5 +- src/peft/tuners/lokr/layer.py | 7 +- src/peft/tuners/lora/layer.py | 6 +- src/peft/tuners/lora/model.py | 26 ++----- src/peft/tuners/lycoris_utils.py | 19 +++-- src/peft/tuners/tuners_utils.py | 55 ++++++++++++++- tests/test_custom_models.py | 8 +++ tests/test_decoder_models.py | 4 ++ tests/test_encoder_decoder_models.py | 4 ++ tests/test_feature_extraction_models.py | 4 ++ tests/testing_common.py | 94 ++++++++++++++++++------- 14 files changed, 175 insertions(+), 68 deletions(-) diff --git a/src/peft/tuners/adalora/layer.py b/src/peft/tuners/adalora/layer.py index 4f5b119f34..5777581e9b 100644 --- a/src/peft/tuners/adalora/layer.py +++ b/src/peft/tuners/adalora/layer.py @@ -26,7 +26,8 @@ class AdaLoraLayer(LoraLayer): # List all names of layers that may contain adapter weights # Note: ranknum doesn't need to be included as it is not an nn.Module - adapter_layer_names = ["lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B"] + adapter_layer_names = ("lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B") + # other_param_names is defined in LoraLayer def __init__( self, diff --git a/src/peft/tuners/ia3/layer.py b/src/peft/tuners/ia3/layer.py index b4ff69cc64..cd278a450a 100644 --- a/src/peft/tuners/ia3/layer.py +++ b/src/peft/tuners/ia3/layer.py @@ -25,8 +25,10 @@ class IA3Layer(BaseTunerLayer): - # List all names of layers that may contain adapter weights - adapter_layer_names = ["ia3_l"] + # All names of layers that may contain adapter weights + adapter_layer_names = ("ia3_l",) + # All names of other parameters that may contain adapter-related parameters + other_layer_names = ("scaling",) def __init__( self, diff --git a/src/peft/tuners/ia3/model.py b/src/peft/tuners/ia3/model.py index 3714134cc2..f18fbf6b4b 100644 --- a/src/peft/tuners/ia3/model.py +++ b/src/peft/tuners/ia3/model.py @@ -206,7 +206,7 @@ def _create_and_replace( "New adapter should have the same value for `is_feedforward` as previously added adapter." 
) if isinstance(target, torch.nn.Conv2d): - target.update_layer_conv2d( + target.update_layer( adapter_name, ia3_config.init_ia3_weights, ) diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 26f57ac681..2a8a205b02 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -24,8 +24,9 @@ class LoHaLayer(LycorisLayer, nn.Module): - # List all names of layers that may contain adapter weights - adapter_layer_names = ["hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2"] + # All names of layers that may contain adapter weights + adapter_layer_names = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2") + # other_param_names is defined on parent class def __init__(self): LycorisLayer.__init__(self) diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py index 9b01ecf96f..97f3afb6fd 100644 --- a/src/peft/tuners/lokr/layer.py +++ b/src/peft/tuners/lokr/layer.py @@ -24,8 +24,8 @@ class LoKrLayer(LycorisLayer, nn.Module): - # List all names of layers that may contain adapter weights - adapter_layer_names = [ + # All names of layers that may contain adapter weights + adapter_layer_names = ( "lokr_w1", "lokr_w1_a", "lokr_w1_b", @@ -33,7 +33,8 @@ class LoKrLayer(LycorisLayer, nn.Module): "lokr_w2_a", "lokr_w2_b", "lokr_t2", - ] + ) + # other_param_names is defined on parent class def __init__(self): LycorisLayer.__init__(self) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 0eb2efa2f2..ab9eb83fcc 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -26,8 +26,10 @@ class LoraLayer(BaseTunerLayer): - # List all names of layers that may contain adapter weights - adapter_layer_names = ["lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B"] + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout") def __init__(self, in_features: int, out_features: int, **kwargs): self.r = {} diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 666c611c0e..85bc8b2fd4 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -661,29 +661,15 @@ def delete_adapter(self, adapter_name: str): del self.peft_config[adapter_name] key_list = [key for key, _ in self.model.named_modules() if "lora" not in key] + new_adapter = None for key in key_list: _, target, _ = _get_submodules(self.model, key) if isinstance(target, LoraLayer): - for attr in [ - "r", - "lora_alpha", - "scaling", - "lora_A", - "lora_B", - "lora_embedding_A", - "lora_embedding_B", - "lora_dropout", - ]: - if adapter_name in getattr(target, attr): - getattr(target, attr).pop(adapter_name) - if adapter_name in target.active_adapters: - resetting_active_adapter = ( - list(self.peft_config.keys())[0] if len(self.peft_config) > 0 else "default" - ) - warnings.warn( - f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to {resetting_active_adapter}. 
" - ) - target.set_adapter(resetting_active_adapter) + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] def merge_and_unload(self, progressbar: bool = False, safe_merge: bool = False): r""" diff --git a/src/peft/tuners/lycoris_utils.py b/src/peft/tuners/lycoris_utils.py index 8d3fb7481b..b82c960230 100644 --- a/src/peft/tuners/lycoris_utils.py +++ b/src/peft/tuners/lycoris_utils.py @@ -62,6 +62,8 @@ class LycorisLayer(BaseTunerLayer, nn.Module): r""" A base layer for LyCORIS like adapters """ + # adapter_layer_names needs to be defined on the child class + other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout") def __init__(self): self.r = {} @@ -391,17 +393,12 @@ def delete_adapter(self, adapter_name: str): del self.peft_config[adapter_name] key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None for key in key_list: _, target, _ = _get_submodules(self.model, key) if isinstance(target, LycorisLayer): - for attr in target.adapter_layer_names: - if adapter_name in getattr(target, attr): - getattr(target, attr).pop(adapter_name) - if adapter_name in target.active_adapters: - resetting_active_adapter = ( - list(self.peft_config.keys())[0] if len(self.peft_config) > 0 else "default" - ) - warnings.warn( - f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to {resetting_active_adapter}. " - ) - target.set_adapter(resetting_active_adapter) + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py index 8ca3abfa56..14dfe1b779 100644 --- a/src/peft/tuners/tuners_utils.py +++ b/src/peft/tuners/tuners_utils.py @@ -16,6 +16,7 @@ import logging import re +import warnings from abc import ABC, abstractmethod from typing import Any, Union @@ -272,8 +273,10 @@ class BaseTunerLayer(ABC): """ active_adapter = None - # List all names of layers that may contain adapter weights - adapter_layer_names: list[str] = [] + # All names of layers that may contain adapter (trainable) weights + adapter_layer_names: tuple[str] = () + # All names of other parameters that may contain adapter-related parameters + other_param_names: tuple[str] = () # indicates whether all adapters should be disabled _disable_adapters: bool = False @@ -351,6 +354,54 @@ def set_adapter(self, adapter_names: str | list[str]): self._active_adapter = adapter_names + def _all_available_adapter_names(self) -> list[str]: + """Return a sorted list of all available adapter names""" + adapter_names = set() + for name in self.adapter_layer_names + self.other_param_names: + # we check each possible attribute and if it's a dict or ModuleDict, we assume that the keys are the adapter + # names + attr = getattr(self, name) + if hasattr(attr, "keys"): + adapter_names.update(attr.keys()) + return sorted(adapter_names) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Delete an adapter from the layer + + This should be called on all adapter layers, or else we will get an inconsistent state. + + This method will also set a new active adapter if the deleted adapter was an active adapter. It is important + that the new adapter is chosen in a deterministic way, so that the same adapter is chosen on all layers. 
+ + Args: + adapter_name (`str`): The name of the adapter to delete + + """ + for attr in self.adapter_layer_names + self.other_param_names: + if adapter_name in getattr(self, attr): + del getattr(self, attr)[adapter_name] + + if adapter_name in self.active_adapters: + # choose a new active adapter + active_adapters = self.active_adapters[:] + active_adapters.remove(adapter_name) + if active_adapters: + self.set_adapter(active_adapters) + else: + # no active adapters left, set a new default adapter + # here we get the list of all adapters existing adapter names and choose the first one + remaining_adapters = self._all_available_adapter_names() + if not remaining_adapters: + self.set_adapter([]) + else: + new_active_adapter = remaining_adapters[0] + warnings.warn( + f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to " + f"{new_active_adapter}." + ) + self.set_adapter(remaining_adapters[0]) + def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None: """A helper method to check if the passed module's key name matches any of the target modules in the adapter_config. diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 9bd4dec9b6..368b8ec9aa 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -681,6 +681,14 @@ def run_with_disable(config_kwargs, bias): # This is bad, there was a warning about the bias when there should not have been any. self.fail("There should be no warning when bias is set to 'none'") + @parameterized.expand(TEST_CASES) + def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs): + self._test_delete_adapter(model_id, config_cls, config_kwargs) + + @parameterized.expand(TEST_CASES) + def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs): + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs) + @parameterized.expand(TEST_CASES) def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, config_cls, config_kwargs): self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs) diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index ea30a8183c..bb8df694d7 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -154,6 +154,10 @@ def test_peft_model_device_map(self, test_name, model_id, config_cls, config_kwa def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs): self._test_delete_adapter(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs): + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, config_cls, config_kwargs): self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index 9acc1c63a2..e1f9cb239d 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -128,6 +128,10 @@ def test_peft_model_device_map(self, test_name, model_id, config_cls, config_kwa def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs): self._test_delete_adapter(model_id, config_cls, config_kwargs) + 
@parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs): + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, config_cls, config_kwargs): self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs) diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index 94e2c81835..ce09fc6247 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -146,6 +146,10 @@ def test_peft_model_device_map(self, test_name, model_id, config_cls, config_kwa def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs): self._test_delete_adapter(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs): + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs) + @parameterized.expand( PeftTestConfigManager.get_grid_parameters( { diff --git a/tests/testing_common.py b/tests/testing_common.py index 8bb7a104cd..0426bba8e2 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -29,6 +29,7 @@ IA3Config, LoraConfig, PeftModel, + PeftType, PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, @@ -815,42 +816,87 @@ def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwar self.assertIsNotNone(param.grad) def _test_delete_adapter(self, model_id, config_cls, config_kwargs): - if issubclass(config_cls, AdaLoraConfig): - # AdaLora does not support adding more than 1 adapter + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR] + # IA3 does not support deleting adapters yet, but it just needs to be added + # AdaLora does not support multiple adapters + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + if config.peft_type not in supported_peft_types: return model = self.transformers_class.from_pretrained(model_id) + if isinstance(config.target_modules, str): + # TODO this should be doable + self.skipTest("Multiple adapters cannot currently be added when target_modules is a string.") + + adapter_to_delete = "delete_me" + model = get_peft_model(model, config) + model.add_adapter(adapter_to_delete, config) + model.set_adapter(adapter_to_delete) + model = model.to(self.torch_device) + model.delete_adapter(adapter_to_delete) + self.assertFalse(adapter_to_delete in model.peft_config) + self.assertEqual(model.active_adapters, ["default"]) + + key_list = [key for key, _ in model.named_modules() if "lora" not in key] + for key in key_list: + _, target, _ = _get_submodules(model, key) + attributes_to_check = getattr(target, "adapter_layer_names", []) + getattr(target, "other_param_names", []) + for attr in attributes_to_check: + self.assertFalse(adapter_to_delete in getattr(target, attr)) + + # check that we can also delete the last remaining adapter + model.delete_adapter("default") + self.assertFalse("default" in model.peft_config) + self.assertEqual(model.active_adapters, []) + + input = self.prepare_inputs_for_testing() + # note: we cannot call model(**input) because PeftModel always expects there to be at least one adapter + model.base_model(**input) # should not raise 
an error + + def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): + # same as test_delete_adapter, but this time an inactive adapter is deleted + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR] + # IA3 does not support deleting adapters yet, but it just needs to be added + # AdaLora does not support multiple adapters config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) + if config.peft_type not in supported_peft_types: + return + + model = self.transformers_class.from_pretrained(model_id) + if isinstance(config.target_modules, str): + # TODO this should be doable + self.skipTest("Multiple adapters cannot currently be added when target_modules is a string.") + adapter_to_delete = "delete_me" model = get_peft_model(model, config) model.add_adapter(adapter_to_delete, config) - model.set_adapter(adapter_to_delete) + # "delete_me" is added but not activated model = model.to(self.torch_device) + model.delete_adapter(adapter_to_delete) + self.assertFalse(adapter_to_delete in model.peft_config) + self.assertEqual(model.active_adapters, ["default"]) - if config.peft_type not in ("LORA"): - with self.assertRaises(AttributeError): - model.delete_adapter(adapter_to_delete) - else: - model.delete_adapter(adapter_to_delete) - self.assertFalse(adapter_to_delete in model.peft_config) - key_list = [key for key, _ in model.named_modules() if "lora" not in key] - for key in key_list: - _, target, _ = _get_submodules(model, key) - if isinstance(target, LoraLayer): - for attr in [ - "r", - "lora_alpha", - "scaling", - "lora_A", - "lora_B", - "lora_embedding_A", - "lora_embedding_B", - "lora_dropout", - ]: - self.assertFalse(adapter_to_delete in getattr(target, attr)) + key_list = [key for key, _ in model.named_modules() if "lora" not in key] + for key in key_list: + _, target, _ = _get_submodules(model, key) + attributes_to_check = getattr(target, "adapter_layer_names", []) + getattr(target, "other_param_names", []) + for attr in attributes_to_check: + self.assertFalse(adapter_to_delete in getattr(target, attr)) + + # check that we can also delete the last remaining adapter + model.delete_adapter("default") + self.assertFalse("default" in model.peft_config) + self.assertEqual(model.active_adapters, []) + + input = self.prepare_inputs_for_testing() + # note: we cannot call model(**input) because PeftModel always expects there to be at least one adapter + model.base_model(**input) # should not raise an error def _test_unload_adapter(self, model_id, config_cls, config_kwargs): model = self.transformers_class.from_pretrained(model_id) From 49ddefa83443052ca9c0ee56d99476f16f375bdc Mon Sep 17 00:00:00 2001 From: Lukas Kuhn Date: Fri, 10 Nov 2023 14:21:14 +0100 Subject: [PATCH 26/65] Add num_dataloader_workers arg to dreambooth script (#1107) This is especially important for Windows users, who may have to set the number of workers to 0. 
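For illustration, a minimal sketch (not the training script itself) of how the new flag feeds the dataloader; the tensor dataset is just a stand-in for the DreamBooth dataset, and passing 0 disables worker subprocesses, which sidesteps the slow spawn-based worker start-up on Windows:

    import argparse

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader."
    )
    args = parser.parse_args(["--num_dataloader_workers", "0"])  # e.g. what a Windows user would pass

    dataset = TensorDataset(torch.randn(8, 3))  # stand-in for the real DreamBooth dataset
    loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=args.num_dataloader_workers)
    print(next(iter(loader))[0].shape)  # torch.Size([4, 3])
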
--- docs/source/task_guides/dreambooth_lora.mdx | 3 +++ examples/lora_dreambooth/train_dreambooth.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/source/task_guides/dreambooth_lora.mdx b/docs/source/task_guides/dreambooth_lora.mdx index 309986db1f..82dbaae80d 100644 --- a/docs/source/task_guides/dreambooth_lora.mdx +++ b/docs/source/task_guides/dreambooth_lora.mdx @@ -83,6 +83,7 @@ accelerate launch train_dreambooth.py \ --output_dir=$OUTPUT_DIR \ --train_text_encoder \ --with_prior_preservation --prior_loss_weight=1.0 \ + --num_dataloader_workers=1 \ --instance_prompt="a photo of sks dog" \ --class_prompt="a photo of dog" \ --resolution=512 \ @@ -101,6 +102,8 @@ accelerate launch train_dreambooth.py \ --max_train_steps=800 ``` +If you are running this script on Windows, you may need to set the `--num_dataloader_workers` to 0. + ## Inference with a single adapter To run inference with the fine-tuned model, first specify the base model with which the fine-tuned LoRA weights will be combined: diff --git a/examples/lora_dreambooth/train_dreambooth.py b/examples/lora_dreambooth/train_dreambooth.py index caf37960e1..3e350b0313 100644 --- a/examples/lora_dreambooth/train_dreambooth.py +++ b/examples/lora_dreambooth/train_dreambooth.py @@ -213,6 +213,10 @@ def parse_args(input_args=None): help="Bias type for Lora. Can be 'none', 'all' or 'lora_only', only used if use_lora and `train_text_encoder` are True", ) + parser.add_argument( + "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader." + ) + parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." ) @@ -799,7 +803,7 @@ def main(args): batch_size=args.train_batch_size, shuffle=True, collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), - num_workers=1, + num_workers=args.num_dataloader_workers, ) # Scheduler and math around the number of training steps. From 5d84484079ee72c92678eadb273d3fe0241ed5ea Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 10 Nov 2023 18:37:38 +0100 Subject: [PATCH 27/65] fix import issue transformers (#1116) --- src/peft/import_utils.py | 18 ++++++++++++++++++ src/peft/peft_model.py | 2 +- src/peft/utils/__init__.py | 1 + src/peft/utils/other.py | 30 ++++++++++++++++++++++++++++-- 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/peft/import_utils.py b/src/peft/import_utils.py index a62525c1c4..f82d2238f1 100644 --- a/src/peft/import_utils.py +++ b/src/peft/import_utils.py @@ -14,6 +14,7 @@ # limitations under the License. 
import importlib import importlib.metadata as importlib_metadata +from functools import lru_cache import packaging.version @@ -46,3 +47,20 @@ def is_auto_gptq_available(): def is_optimum_available() -> bool: return importlib.util.find_spec("optimum") is not None + + +@lru_cache() +def is_torch_tpu_available(check_device=True): + "Checks if `torch_xla` is installed and potentially if a TPU is in the environment" + if importlib.util.find_spec("torch_xla") is not None: + if check_device: + # We need to check if `xla_device` can be found, will raise a RuntimeError if not + try: + import torch_xla.core.xla_model as xm + + _ = xm.xla_device() + return True + except RuntimeError: + return False + return True + return False diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index fb4beef27d..e762740207 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -32,7 +32,6 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import PreTrainedModel from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput -from transformers.pytorch_utils import id_tensor_storage from transformers.utils import PushToHubMixin from . import __version__ @@ -60,6 +59,7 @@ _set_adapter, _set_trainable, get_peft_model_state_dict, + id_tensor_storage, infer_device, load_peft_weights, set_peft_model_state_dict, diff --git a/src/peft/utils/__init__.py b/src/peft/utils/__init__.py index b42d8d070b..1ba150eb02 100644 --- a/src/peft/utils/__init__.py +++ b/src/peft/utils/__init__.py @@ -45,6 +45,7 @@ infer_device, get_auto_gptq_quant_linear, get_quantization_config, + id_tensor_storage, ) from .hub_utils import hub_file_exists from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index c879a45a0f..4f64fa4487 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -15,14 +15,15 @@ import copy import inspect import warnings -from typing import Optional +from typing import Optional, Tuple import accelerate import torch from accelerate.hooks import add_hook_to_module, remove_hook_from_module from accelerate.utils import is_npu_available, is_xpu_available +from safetensors.torch import storage_ptr, storage_size -from ..import_utils import is_auto_gptq_available +from ..import_utils import is_auto_gptq_available, is_torch_tpu_available # Get current device name based on available devices @@ -438,6 +439,31 @@ def get_auto_gptq_quant_linear(gptq_quantization_config): return None +def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: + """ + Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For + example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is + guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with + non-overlapping lifetimes may have the same id. + + This method is the exact same copy of + https://github.com/huggingface/transformers/blob/main/src/transformers/pytorch_utils.py#L282C1-L300C58 but we added + it here manually to avoid import issue with old versions of transformers. + """ + if tensor.device.type == "xla" and is_torch_tpu_available(): + # NOTE: xla tensors dont have storage + # use some other unique id to distinguish. + # this is a XLA tensor, it must be created using torch_xla's + # device. 
So the following import is safe: + import torch_xla + + unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor) + else: + unique_id = storage_ptr(tensor) + + return tensor.device, unique_id, storage_size(tensor) + + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = { "t5": ["q", "v"], "mt5": ["q", "v"], From b25ce8a0cdf25e0568949bc67d169b905bd5eb8b Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 13 Nov 2023 07:52:30 +0100 Subject: [PATCH 28/65] Correctly deal with `ModulesToSaveWrapper` when using Low-level API (#1112) * correctly deal with `ModulesToSaveWrapper` * style * fix tests (#1117) --- src/peft/tuners/tuners_utils.py | 27 ++++++++++++++++++++++++++- tests/test_low_level_api.py | 26 ++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py index 14dfe1b779..004352237f 100644 --- a/src/peft/tuners/tuners_utils.py +++ b/src/peft/tuners/tuners_utils.py @@ -25,7 +25,7 @@ from peft.utils import COMMON_LAYERS_PATTERN from ..config import PeftConfig -from ..utils import _get_submodules +from ..utils import ModulesToSaveWrapper, _get_submodules logger = logging.getLogger(__name__) @@ -211,6 +211,9 @@ def inject_adapter(self, model: nn.Module, adapter_name: str): is_target_modules_in_base_model = False key_list = [key for key, _ in model.named_modules()] + _check_for_modules_to_save = getattr(peft_config, "modules_to_save", None) is not None + _has_modules_to_save = False + model_config = getattr(model, "config", {"model_type": "custom"}) if hasattr(model_config, "to_dict"): model_config = model_config.to_dict() @@ -218,6 +221,22 @@ def inject_adapter(self, model: nn.Module, adapter_name: str): peft_config = self._prepare_adapter_config(peft_config, model_config) for key in key_list: + # Check for modules_to_save in case + if _check_for_modules_to_save and any( + key.endswith(f"{module_to_save}") for module_to_save in peft_config.modules_to_save + ): + # Optionally set the modules to save + parent, target, target_name = _get_submodules(model, key) + + if not isinstance(target, ModulesToSaveWrapper): + new_module = ModulesToSaveWrapper(target, adapter_name) + setattr(parent, target_name, new_module) + else: + target.update(adapter_name) + + _has_modules_to_save = True + continue + if not self._check_target_module_exists(peft_config, key): continue @@ -244,6 +263,12 @@ def inject_adapter(self, model: nn.Module, adapter_name: str): if adapter_name in n: p.requires_grad = False + if _has_modules_to_save: + if not hasattr(model, "modules_to_save"): + model.modules_to_save = set(peft_config.modules_to_save) + else: + model.modules_to_save.update(set(peft_config.modules_to_save)) + def merge_adapter(self): """ This method merges the LoRa layers into the base model. 
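The net effect of the change above is that the low-level inject_adapter_in_model API now honors modules_to_save by wrapping those modules in ModulesToSaveWrapper, as the new test below exercises. A minimal sketch with a toy module (the layer names are illustrative):

    import torch
    from peft import LoraConfig, inject_adapter_in_model
    from peft.utils import ModulesToSaveWrapper

    class DummyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.embedding = torch.nn.Embedding(10, 10)
            self.linear = torch.nn.Linear(10, 10)

    config = LoraConfig(target_modules=["linear"], modules_to_save=["embedding"])
    model = inject_adapter_in_model(config, DummyModel())
    print(isinstance(model.embedding, ModulesToSaveWrapper))  # True with this patch, False before it
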
diff --git a/tests/test_low_level_api.py b/tests/test_low_level_api.py index d03ded0393..4ab1dd7203 100644 --- a/tests/test_low_level_api.py +++ b/tests/test_low_level_api.py @@ -19,6 +19,7 @@ import torch from peft import LoraConfig, get_peft_model_state_dict, inject_adapter_in_model +from peft.utils import ModulesToSaveWrapper class DummyModel(torch.nn.Module): @@ -63,3 +64,28 @@ def test_get_peft_model_state_dict(self): for key in peft_state_dict.keys(): self.assertTrue("lora" in key) + + def test_modules_to_save(self): + self.model = DummyModel() + + lora_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + target_modules=["linear"], + modules_to_save=["embedding"], + ) + + self.model = inject_adapter_in_model(lora_config, self.model) + + for name, module in self.model.named_modules(): + if name == "linear": + self.assertTrue(hasattr(module, "lora_A")) + self.assertTrue(hasattr(module, "lora_B")) + elif name == "embedding": + self.assertTrue(isinstance(module, ModulesToSaveWrapper)) + + state_dict = get_peft_model_state_dict(self.model) + + self.assertTrue("embedding.weight" in state_dict.keys()) From 79298c7c24abbde8927e917eb0f340b55b83122d Mon Sep 17 00:00:00 2001 From: ChG Date: Mon, 13 Nov 2023 01:48:50 -0800 Subject: [PATCH 29/65] fix doc typo (#1121) --- docs/source/task_guides/dreambooth_lora.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/task_guides/dreambooth_lora.mdx b/docs/source/task_guides/dreambooth_lora.mdx index 82dbaae80d..fe734b304b 100644 --- a/docs/source/task_guides/dreambooth_lora.mdx +++ b/docs/source/task_guides/dreambooth_lora.mdx @@ -174,7 +174,7 @@ image.save("DESTINATION_PATH_FOR_THE_IMAGE") ## Multi-adapter inference With PEFT you can combine multiple adapters for inference. In the previous example you have fine-tuned Stable Diffusion on -some dog images. The pipeline created based on these weights got a name - `adapter_name="dog`. Now, suppose you also fine-tuned +some dog images. The pipeline created based on these weights got a name - `adapter_name="dog"`. Now, suppose you also fine-tuned this base model on images of a crochet toy. Let's see how we can use both adapters. First, you'll need to perform all the steps as in the single adapter inference example: From f020404ee60dbf46415bd65414ba4d1a3e3a8992 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Tue, 14 Nov 2023 11:13:21 +0530 Subject: [PATCH 30/65] Release: v0.6.2 (#1125) --- setup.py | 2 +- src/peft/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 1418afa46a..b626882896 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name="peft", - version="0.6.2.dev0", + version="0.6.2", description="Parameter-Efficient Fine-Tuning (PEFT)", license_files=["LICENSE"], long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 4a5ecb7de1..24a0635644 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.6.2.dev0" +__version__ = "0.6.2" from .auto import ( AutoPeftModel, From 94877b5008ea934ef49f35326f927f69d53580c6 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Tue, 14 Nov 2023 14:59:55 +0530 Subject: [PATCH 31/65] Release: v0.6.3.dev0 (#1128) --- setup.py | 2 +- src/peft/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b626882896..975c9c36f8 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setup( name="peft", - version="0.6.2", + version="0.6.3.dev0", description="Parameter-Efficient Fine-Tuning (PEFT)", license_files=["LICENSE"], long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 24a0635644..a3ce332f24 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.6.2" +__version__ = "0.6.3.dev0" from .auto import ( AutoPeftModel, From ad756173f126e54ccd9954a5ffd4571f08363e4d Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 14 Nov 2023 10:30:52 +0100 Subject: [PATCH 32/65] FIX: Adding 2 adapters when target_modules is a str fails (#1111) * Fix adding 2 adapters when target_modules is a str Problem description Adding two adapters (e.g. LoRA) when using a list for `target_mdules` works but passing a str fails. The issue is that for str, we do a `re.fullmatch`, whereas for list, we just check `endswith`. After adding the first adapter, though, the naming pattern of the modules changes. In the example above, the name for the linear layer changes from `"lin0"` to `"base_model.model.lin0"`, which is why the `fullmatch` fails but the `endswith` still works. Reproduction from peft import LoraConfig, get_peft_model from torch import nn class MLP(nn.Module): def __init__(self, bias=True): super().__init__() self.lin0 = nn.Linear(10, 20, bias=bias) def test_target_modules_list(): config = LoraConfig(target_modules=["lin0"]) test_it(config) print("Adding two adapters with target_module being a list works") def test_target_modules_str(): config = LoraConfig(target_modules="lin0") test_it(config) def test_it(config): model = MLP() model = get_peft_model(model, config, "adapter0") model.add_adapter("adapter1", config) print("Adding two adapters with target_module being a str works") if __name__ == "__main__": # works test_target_modules_list() # ValueError: Target modules lin0 not found in the base model test_target_modules_str() I think that most users would be surprised that: 1. Adding the first adapter works but adding the second fails, even though they use the same config. 2. Using `target_modules=["lin0"]` works but `target_modules="lin0"` fails for the 2nd adapter. Solution We could change the logic of not using `re.fullmatch` for str, but I think that could be tricky to achieve without breaking BC. Instead, I chose to change the inject_adapter call in add_adapter to pass the base model, not the whole peft model. This way, the naming pattern is preserved. Tests I haven't added extra tests for this. The script above could serve as a test. 
However, it will be sufficient to remove the guard added in #1105: if isinstance(config.target_str, modules): # TODO this should be doable self.skipTest("Multiple adapters cannot currently be added when target_modules is a string.") as that will test exactly this behavior and was how the bug was originally uncovered. Depending on what PR lands first, the guard has to removed in this PR or in #1105. * Enable tests for adding 2 adapters with str --- src/peft/peft_model.py | 2 +- tests/testing_common.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index e762740207..ef66f0cf1c 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -573,7 +573,7 @@ def add_adapter(self, adapter_name: str, peft_config: PeftConfig): self.base_model.add_adapter(adapter_name, peft_config) else: self.peft_config[adapter_name] = peft_config - self.base_model.inject_adapter(self, adapter_name) + self.base_model.inject_adapter(self.base_model.model, adapter_name) except Exception: # somthing went wrong, roll back if adapter_name in self.peft_config: del self.peft_config[adapter_name] diff --git a/tests/testing_common.py b/tests/testing_common.py index 0426bba8e2..a57ab839b2 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -827,10 +827,6 @@ def _test_delete_adapter(self, model_id, config_cls, config_kwargs): return model = self.transformers_class.from_pretrained(model_id) - if isinstance(config.target_modules, str): - # TODO this should be doable - self.skipTest("Multiple adapters cannot currently be added when target_modules is a string.") - adapter_to_delete = "delete_me" model = get_peft_model(model, config) model.add_adapter(adapter_to_delete, config) @@ -869,10 +865,6 @@ def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): return model = self.transformers_class.from_pretrained(model_id) - if isinstance(config.target_modules, str): - # TODO this should be doable - self.skipTest("Multiple adapters cannot currently be added when target_modules is a string.") - adapter_to_delete = "delete_me" model = get_peft_model(model, config) model.add_adapter(adapter_to_delete, config) From d350a00ece47bda6461a2f9999b502f046d9a2e2 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 14 Nov 2023 12:28:55 +0100 Subject: [PATCH 33/65] Prompt tuning: fix AutoTokenizer.from_pretrained (#1053) Fixes #1032 Description Currently, when using prompt tuning with TEXT, we call AutoTokenizer.from_pretrained with only the model id. However, it may be necessary to pass additional arguments, e.g. trust_remote_code=True. This fix allows to pass more arguments by setting the argument tokenizer_kwargs in the PromptTuningConfig. I also added a check that when tokenizer_kwargs is set, the TEXT option is actually being used. Moreover, I noticed that we have no tests for prompt tuning with TEXT, so I added those tests for decoder models. Additional changes There was a bug in PromptEmbedding where the device of the init_token_ids was not set, which resulted in errors when using CUDA. Finally, I removed an unused constant CONFIG_CLASSES from a test. 
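For illustration, a short usage sketch of the new argument, mirroring the test added below (the checkpoint is the tiny test model from the test suite, and trust_remote_code is just an example of a kwarg one may need to forward):

    from transformers import AutoModelForCausalLM
    from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
    config = PromptTuningConfig(
        task_type="CAUSAL_LM",
        base_model_name_or_path=model_id,
        tokenizer_name_or_path=model_id,
        num_virtual_tokens=10,
        prompt_tuning_init=PromptTuningInit.TEXT,
        prompt_tuning_init_text="This is a test prompt.",
        # forwarded to AutoTokenizer.from_pretrained; only valid together with prompt_tuning_init=TEXT
        tokenizer_kwargs={"trust_remote_code": True},
    )
    model = get_peft_model(AutoModelForCausalLM.from_pretrained(model_id), config)
    model.print_trainable_parameters()
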
--- src/peft/tuners/prompt_tuning/config.py | 16 +++++++ src/peft/tuners/prompt_tuning/model.py | 6 ++- tests/test_decoder_models.py | 60 ++++++++++++++++++++++++- tests/testing_common.py | 7 --- 4 files changed, 78 insertions(+), 11 deletions(-) diff --git a/src/peft/tuners/prompt_tuning/config.py b/src/peft/tuners/prompt_tuning/config.py index 181ccd793f..46df189673 100644 --- a/src/peft/tuners/prompt_tuning/config.py +++ b/src/peft/tuners/prompt_tuning/config.py @@ -37,6 +37,9 @@ class PromptTuningConfig(PromptLearningConfig): The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`. tokenizer_name_or_path (`str`, *optional*): The name or path of the tokenizer. Only used if `prompt_tuning_init` is `TEXT`. + tokenizer_kwargs (`dict`, *optional*): + The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if `prompt_tuning_init` is + `TEXT`. """ prompt_tuning_init: Union[PromptTuningInit, str] = field( @@ -56,5 +59,18 @@ class PromptTuningConfig(PromptLearningConfig): }, ) + tokenizer_kwargs: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if prompt_tuning_init is " + "`TEXT`" + ), + }, + ) + def __post_init__(self): self.peft_type = PeftType.PROMPT_TUNING + + if self.tokenizer_kwargs and (self.prompt_tuning_init != PromptTuningInit.TEXT): + raise ValueError(f"tokenizer_kwargs only valid when using prompt_tuning_init='{PromptTuningInit.TEXT}'.") diff --git a/src/peft/tuners/prompt_tuning/model.py b/src/peft/tuners/prompt_tuning/model.py index 6a6442cc65..9d273a4a9f 100644 --- a/src/peft/tuners/prompt_tuning/model.py +++ b/src/peft/tuners/prompt_tuning/model.py @@ -66,7 +66,8 @@ def __init__(self, config, word_embeddings): if config.prompt_tuning_init == PromptTuningInit.TEXT: from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name_or_path) + tokenizer_kwargs = config.tokenizer_kwargs or {} + tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name_or_path, **tokenizer_kwargs) init_text = config.prompt_tuning_init_text init_token_ids = tokenizer(init_text)["input_ids"] # Trim or iterate until num_text_tokens matches total_virtual_tokens @@ -77,8 +78,9 @@ def __init__(self, config, word_embeddings): num_reps = math.ceil(total_virtual_tokens / num_text_tokens) init_token_ids = init_token_ids * num_reps init_token_ids = init_token_ids[:total_virtual_tokens] + init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device) - word_embedding_weights = word_embeddings(torch.LongTensor(init_token_ids)).detach().clone() + word_embedding_weights = word_embeddings(init_token_ids).detach().clone() word_embedding_weights = word_embedding_weights.to(torch.float32) self.embedding.weight = torch.nn.Parameter(word_embedding_weights) diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index bb8df694d7..3afd85c015 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -13,12 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import unittest +from unittest.mock import Mock, call, patch import torch from parameterized import parameterized -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer -from peft import AdaLoraConfig +from peft import AdaLoraConfig, PromptTuningConfig, PromptTuningInit, get_peft_model from .testing_common import PeftCommonTester, PeftTestConfigManager @@ -76,6 +77,61 @@ def test_adapter_name(self, test_name, model_id, config_cls, config_kwargs): def test_prepare_for_training_parametrized(self, test_name, model_id, config_cls, config_kwargs): self._test_prepare_for_training(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_prompt_tuning_text_prepare_for_training(self, test_name, model_id, config_cls, config_kwargs): + # Test that prompt tuning works with text init + if config_cls != PromptTuningConfig: + return + + config_kwargs = config_kwargs.copy() + config_kwargs["prompt_tuning_init"] = PromptTuningInit.TEXT + config_kwargs["prompt_tuning_init_text"] = "This is a test prompt." + config_kwargs["tokenizer_name_or_path"] = model_id + self._test_prepare_for_training(model_id, config_cls, config_kwargs) + + def test_prompt_tuning_text_tokenizer_kwargs(self): + # Allow users to pass additional arguments to Tokenizer.from_pretrained + # Fix for #1032 + mock = Mock() + orig_from_pretrained = AutoTokenizer.from_pretrained + + def mock_autotokenizer_from_pretrained(*args, **kwargs): + mock(*args, **kwargs) + return orig_from_pretrained(config.tokenizer_name_or_path) + + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + config = PromptTuningConfig( + base_model_name_or_path=model_id, + tokenizer_name_or_path=model_id, + num_virtual_tokens=10, + prompt_tuning_init=PromptTuningInit.TEXT, + task_type="CAUSAL_LM", + prompt_tuning_init_text="This is a test prompt.", + tokenizer_kwargs={"trust_remote_code": True, "foo": "bar"}, + ) + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + with patch("transformers.AutoTokenizer.from_pretrained", mock_autotokenizer_from_pretrained): + model = get_peft_model(model, config) + + expected_call = call(model_id, trust_remote_code=True, foo="bar") + self.assertEqual(mock.call_args, expected_call) + + def test_prompt_tuning_config_invalid_args(self): + # Raise an error when tokenizer_kwargs is used with prompt_tuning_init!='TEXT', because this argument has no + # function in that case + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + msg = "tokenizer_kwargs only valid when using prompt_tuning_init='TEXT'." 
+ with self.assertRaisesRegex(ValueError, expected_regex=msg): + PromptTuningConfig( + base_model_name_or_path=model_id, + tokenizer_name_or_path=model_id, + num_virtual_tokens=10, + task_type="CAUSAL_LM", + prompt_tuning_init_text="This is a test prompt.", + prompt_tuning_init=PromptTuningInit.RANDOM, # <= should not be used together with tokenizer_kwargs + tokenizer_kwargs={"trust_remote_code": True, "foo": "bar"}, + ) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_save_pretrained(self, test_name, model_id, config_cls, config_kwargs): self._test_save_pretrained(model_id, config_cls, config_kwargs) diff --git a/tests/testing_common.py b/tests/testing_common.py index a57ab839b2..3fa7da6163 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -44,13 +44,6 @@ from .testing_utils import get_state_dict -CONFIG_CLASSES = ( - IA3Config, - LoraConfig, - PrefixTuningConfig, - PromptEncoderConfig, - PromptTuningConfig, -) CONFIG_TESTING_KWARGS = ( # IAΒ³ { From 98429b81840a0d75ed85d46c5aed71245bce992b Mon Sep 17 00:00:00 2001 From: Lukas Kuhn Date: Tue, 14 Nov 2023 12:34:32 +0100 Subject: [PATCH 34/65] Fix: TorchTracemalloc ruins Windows performance (#1126) * feat: added tracemalloc arg to train_dreambooth * fix: added help for arg * fix: changed arg name * fix formatting * fix: import order --- examples/lora_dreambooth/train_dreambooth.py | 44 +++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/examples/lora_dreambooth/train_dreambooth.py b/examples/lora_dreambooth/train_dreambooth.py index 3e350b0313..73d827a4c3 100644 --- a/examples/lora_dreambooth/train_dreambooth.py +++ b/examples/lora_dreambooth/train_dreambooth.py @@ -7,6 +7,7 @@ import os import threading import warnings +from contextlib import nullcontext from pathlib import Path from typing import Optional @@ -217,6 +218,13 @@ def parse_args(input_args=None): "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader." ) + parser.add_argument( + "--no_tracemalloc", + default=False, + action="store_true", + help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.", + ) + parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." 
) @@ -897,7 +905,7 @@ def main(args): unet.train() if args.train_text_encoder: text_encoder.train() - with TorchTracemalloc() as tracemalloc: + with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc: for step, batch in enumerate(train_dataloader): # Skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: @@ -1038,23 +1046,29 @@ def main(args): if global_step >= args.max_train_steps: break # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage - accelerator.print("GPU Memory before entering the train : {}".format(b2mb(tracemalloc.begin))) - accelerator.print("GPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used)) - accelerator.print("GPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked)) - accelerator.print( - "GPU Total Peak Memory consumed during the train (max): {}".format( - tracemalloc.peaked + b2mb(tracemalloc.begin) + + if not args.no_tracemalloc: + accelerator.print("GPU Memory before entering the train : {}".format(b2mb(tracemalloc.begin))) + accelerator.print("GPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used)) + accelerator.print("GPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked)) + accelerator.print( + "GPU Total Peak Memory consumed during the train (max): {}".format( + tracemalloc.peaked + b2mb(tracemalloc.begin) + ) ) - ) - accelerator.print("CPU Memory before entering the train : {}".format(b2mb(tracemalloc.cpu_begin))) - accelerator.print("CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used)) - accelerator.print("CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked)) - accelerator.print( - "CPU Total Peak Memory consumed during the train (max): {}".format( - tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin) + accelerator.print("CPU Memory before entering the train : {}".format(b2mb(tracemalloc.cpu_begin))) + accelerator.print( + "CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used) + ) + accelerator.print( + "CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked) + ) + accelerator.print( + "CPU Total Peak Memory consumed during the train (max): {}".format( + tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin) + ) ) - ) # Create the pipeline using using the trained modules and save it. accelerator.wait_for_everyone() From 18773290938fc632c42ac49f462ab34bd1abd3ea Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 14 Nov 2023 13:14:49 +0100 Subject: [PATCH 35/65] TST Improve requires grad testing: (#1131) Previously, the corresponding tests were testing only whether specific parameters had requires_grad True or False. Now, all parameters are being checked. This is more rigorous. Also, tests for Embedding, Conv1D, Conv2d were added, thus superseding PR #1115. Finally, tests for LoHa and LoKr were added. Note I considered moving the tests to a separate module, as they were getting quite big and this would help with readability. For now, I left them in the same module because it leads to a better diff view and is thus easier to review. LMK if I should move the tests to a separate file. 
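The new checks boil down to comparing the full set of trainable parameter names against an expected set. A self-contained sketch of that pattern, with a stripped-down two-layer MLP standing in for the test models:

    from torch import nn
    from peft import LoraConfig, get_peft_model

    class MLP(nn.Module):
        def __init__(self):
            super().__init__()
            self.lin0 = nn.Linear(10, 20)
            self.lin1 = nn.Linear(20, 2)

    peft_model = get_peft_model(MLP(), LoraConfig(target_modules=["lin0"]))
    trainable = {name for name, param in peft_model.named_parameters() if param.requires_grad}
    expected = {
        "base_model.model.lin0.lora_A.default.weight",
        "base_model.model.lin0.lora_B.default.weight",
    }
    assert trainable == expected, trainable
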
--- tests/test_custom_models.py | 690 ++++++++++++++++++++++++++++-------- 1 file changed, 533 insertions(+), 157 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 368b8ec9aa..ca3f1cc48a 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -941,14 +941,25 @@ class RequiresGradTester(unittest.TestCase): """ + def check_requires_grad(self, model, *params_expected: str): + # Check that only the given parameters have requires_grad=True, and all others have requires_grad=False. + # Calling without arguments besides the model means that all parameters should have requires_grad=False. + params_with_requires_grad = [name for name, param in model.named_parameters() if param.requires_grad] + diff = set(params_expected).symmetric_difference(set(params_with_requires_grad)) + msg = f"Expected {params_expected} to require gradients, got {params_with_requires_grad}" + self.assertEqual(len(diff), 0, msg=msg) + def test_requires_grad_modules_to_save_default(self): config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) peft_model = get_peft_model(MLP(), config) - self.assertTrue(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - self.assertFalse(peft_model.model.lin1.original_module.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.original_module.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) def test_requires_grad_modules_to_save_disabling(self): config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) @@ -956,30 +967,38 @@ def test_requires_grad_modules_to_save_disabling(self): # when disabling the adapter, the original module's grad should be enabled and vice versa peft_model.disable_adapter_layers() - self.assertFalse(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - self.assertTrue(peft_model.model.lin1.original_module.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.original_module.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.original_module.weight", + "base_model.model.lin1.original_module.bias", + ) # when re-enabling the adapter, the original module's grad should be disabled and vice versa peft_model.enable_adapter_layers() - self.assertTrue(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - self.assertFalse(peft_model.model.lin1.original_module.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.original_module.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) # when using the disable_adapter context, the original module's grad should be enabled and vice versa with peft_model.disable_adapter(): - self.assertFalse(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - 
self.assertTrue(peft_model.model.lin1.original_module.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.original_module.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.original_module.weight", + "base_model.model.lin1.original_module.bias", + ) # after context is exited, return to the previous state - self.assertTrue(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - self.assertFalse(peft_model.model.lin1.original_module.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.original_module.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) def test_requires_grad_modules_to_save_multiple_adapters(self): config0 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) @@ -989,24 +1008,33 @@ def test_requires_grad_modules_to_save_multiple_adapters(self): peft_model.add_adapter("adapter1", config1) # active adapter is still "default" - self.assertTrue(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - self.assertFalse(peft_model.model.lin1.modules_to_save.adapter1.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.modules_to_save.adapter1.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) # set config0 as active, should not change anything peft_model.set_adapter("default") - self.assertTrue(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - self.assertFalse(peft_model.model.lin1.modules_to_save.adapter1.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.modules_to_save.adapter1.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) # set config1 as active, should lead to adapter1 requiring grad peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin1.modules_to_save.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.modules_to_save.default.bias.requires_grad) - self.assertTrue(peft_model.model.lin1.modules_to_save.adapter1.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.modules_to_save.adapter1.bias.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.adapter1.weight", + "base_model.model.lin1.modules_to_save.adapter1.bias", + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) def test_requires_grad_lora_different_targets(self): # test two different LoRA adapters that target different modules @@ -1017,38 +1045,38 @@ def test_requires_grad_lora_different_targets(self): peft_model.add_adapter("adapter1", config1) # active adapter is still "default" - 
self.assertTrue(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_A.adapter1.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) # set config0 as active, should not change anything peft_model.set_adapter("default") - self.assertTrue(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_A.adapter1.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) # change activate adapter to adapter1 peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_A.adapter1.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1.weight", + "base_model.model.lin1.lora_B.adapter1.weight", + ) # disable all adapters with peft_model.disable_adapter(): - self.assertFalse(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_A.adapter1.weight.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad(peft_model) # after context is exited, return to the previous state - peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_A.adapter1.weight.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1.weight", + "base_model.model.lin1.lora_B.adapter1.weight", + ) def test_requires_grad_lora_same_targets(self): # same as previous test, except that LoRA adapters target the same layer @@ -1059,38 +1087,38 @@ def test_requires_grad_lora_same_targets(self): peft_model.add_adapter("adapter1", config1) # active adapter is still "default" - self.assertTrue(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_A.adapter1.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) # set config0 as active, should not change anything peft_model.set_adapter("default") - self.assertTrue(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_A.adapter1.weight.requires_grad) - 
self.assertFalse(peft_model.model.lin0.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) # change activate adapter to adapter1 peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_A.adapter1.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) # disable all adapters with peft_model.disable_adapter(): - self.assertFalse(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_A.adapter1.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad(peft_model) # after context is exited, return to the previous state - peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.weight.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_A.adapter1.weight.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.adapter1.weight.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) def test_requires_grad_ia3_different_targets(self): # test two different IA3 adapters that target different modules @@ -1101,28 +1129,34 @@ def test_requires_grad_ia3_different_targets(self): peft_model.add_adapter("adapter1", config1) # active adapter is still "default" - self.assertTrue(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertFalse(peft_model.model.lin1.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) # set config0 as active, should not change anything peft_model.set_adapter("default") - self.assertTrue(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertFalse(peft_model.model.lin1.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) # change activate adapter to adapter1 peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertTrue(peft_model.model.lin1.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.ia3_l.adapter1", + ) # disable all adapters with peft_model.disable_adapter(): - self.assertFalse(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertFalse(peft_model.model.lin1.ia3_l.adapter1.requires_grad) + self.check_requires_grad(peft_model) # after context is exited, return to the previous state - peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertTrue(peft_model.model.lin1.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.ia3_l.adapter1", + ) def test_requires_grad_ia3_same_targets(self): # same as previous test, except that IA3 adapters target the same layer @@ -1133,28 +1167,34 @@ def 
test_requires_grad_ia3_same_targets(self): peft_model.add_adapter("adapter1", config1) # active adapter is still "default" - self.assertTrue(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertFalse(peft_model.model.lin0.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) # set config0 as active, should not change anything peft_model.set_adapter("default") - self.assertTrue(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertFalse(peft_model.model.lin0.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) # change activate adapter to adapter1 peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertTrue(peft_model.model.lin0.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) # disable all adapters with peft_model.disable_adapter(): - self.assertFalse(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertFalse(peft_model.model.lin0.ia3_l.adapter1.requires_grad) + self.check_requires_grad(peft_model) # after context is exited, return to the previous state - peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.ia3_l.default.requires_grad) - self.assertTrue(peft_model.model.lin0.ia3_l.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) def test_requires_grad_adalora_different_targets(self): # test two different AdaLora adapters that target different modules @@ -1165,48 +1205,42 @@ def test_requires_grad_adalora_different_targets(self): peft_model.add_adapter("adapter1", config1) # active adapter is still "default" - self.assertTrue(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_A.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_B.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_E.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) # set config0 as active, should not change anything peft_model.set_adapter("default") - self.assertTrue(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_A.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_B.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_E.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) # change activate adapter to adapter1 peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_A.adapter1.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_B.adapter1.requires_grad) - 
self.assertTrue(peft_model.model.lin1.lora_E.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1", + "base_model.model.lin1.lora_B.adapter1", + "base_model.model.lin1.lora_E.adapter1", + ) # disable all adapters with peft_model.disable_adapter(): - self.assertFalse(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_A.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_B.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin1.lora_E.adapter1.requires_grad) + self.check_requires_grad(peft_model) # after context is exited, return to the previous state - peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_A.adapter1.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_B.adapter1.requires_grad) - self.assertTrue(peft_model.model.lin1.lora_E.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1", + "base_model.model.lin1.lora_B.adapter1", + "base_model.model.lin1.lora_E.adapter1", + ) def test_requires_grad_adalora_same_targets(self): # same as previous test, except that AdaLora adapters target the same layer @@ -1217,45 +1251,387 @@ def test_requires_grad_adalora_same_targets(self): peft_model.add_adapter("adapter1", config1) # active adapter is still "default" - self.assertTrue(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_A.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) # set config0 as active, should not change anything peft_model.set_adapter("default") - self.assertTrue(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_A.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) # change activate adapter to adapter1 peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_A.adapter1.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.adapter1.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_E.adapter1.requires_grad) + 
self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1", + "base_model.model.lin0.lora_B.adapter1", + "base_model.model.lin0.lora_E.adapter1", + ) # disable all adapters with peft_model.disable_adapter(): - self.assertFalse(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_A.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.adapter1.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.adapter1.requires_grad) + self.check_requires_grad(peft_model) # after context is exited, return to the previous state peft_model.set_adapter("adapter1") - self.assertFalse(peft_model.model.lin0.lora_A.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_B.default.requires_grad) - self.assertFalse(peft_model.model.lin0.lora_E.default.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_A.adapter1.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_B.adapter1.requires_grad) - self.assertTrue(peft_model.model.lin0.lora_E.adapter1.requires_grad) + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1", + "base_model.model.lin0.lora_B.adapter1", + "base_model.model.lin0.lora_E.adapter1", + ) + + def test_requires_grad_lora_conv2d(self): + # test two different LoRA adapters that target different modules + config0 = LoraConfig(target_modules=["conv2d"]) + peft_model = get_peft_model(ModelConv2D(), config0) + + config1 = LoraConfig(target_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.lora_A.default.weight", + "base_model.model.conv2d.lora_B.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.lora_A.default.weight", + "base_model.model.conv2d.lora_B.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) + + def test_requires_grad_lora_emb_conv1d(self): + # test two different LoRA adapters that target different modules + config0 = LoraConfig(target_modules=["conv1d"]) + peft_model = get_peft_model(ModelEmbConv1D(), config0) + + config1 = LoraConfig(target_modules=["emb"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.lora_A.default.weight", + "base_model.model.conv1d.lora_B.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.lora_A.default.weight", + "base_model.model.conv1d.lora_B.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + 
"base_model.model.emb.lora_embedding_A.adapter1", + "base_model.model.emb.lora_embedding_B.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.emb.lora_embedding_A.adapter1", + "base_model.model.emb.lora_embedding_B.adapter1", + ) + + def test_requires_grad_ia3_conv1d(self): + # test two different LoRA adapters that target different modules + config0 = IA3Config(target_modules=["conv1d"], feedforward_modules=[]) + peft_model = get_peft_model(ModelEmbConv1D(), config0) + + config1 = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.ia3_l.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.ia3_l.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + def test_requires_grad_ia3_conv2d(self): + # test two different LoRA adapters that target different modules + config0 = IA3Config(target_modules=["conv2d"], feedforward_modules=["conv2d"]) + peft_model = get_peft_model(ModelConv2D(), config0) + + config1 = IA3Config(target_modules=["lin0"], feedforward_modules=[]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.ia3_l.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.ia3_l.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + def test_requires_grad_loha_different_targets(self): + # test two different LoHa adapters that target different modules + config0 = LoHaConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoHaConfig(target_modules=["lin1"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active pter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + "base_model.model.lin0.hada_w2_b.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + 
"base_model.model.lin0.hada_w2_b.default", + ) + + # change activate pter to pter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.hada_w1_a.adapter1", + "base_model.model.lin1.hada_w1_b.adapter1", + "base_model.model.lin1.hada_w2_a.adapter1", + "base_model.model.lin1.hada_w2_b.adapter1", + ) + + # disable all pters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.hada_w1_a.adapter1", + "base_model.model.lin1.hada_w1_b.adapter1", + "base_model.model.lin1.hada_w2_a.adapter1", + "base_model.model.lin1.hada_w2_b.adapter1", + ) + + def test_requires_grad_loha_same_targets(self): + # same as previous test, except that LoHa adapters target the same layer + config0 = LoHaConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoHaConfig(target_modules=["lin0"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + "base_model.model.lin0.hada_w2_b.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + "base_model.model.lin0.hada_w2_b.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.adapter1", + "base_model.model.lin0.hada_w1_b.adapter1", + "base_model.model.lin0.hada_w2_a.adapter1", + "base_model.model.lin0.hada_w2_b.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.adapter1", + "base_model.model.lin0.hada_w1_b.adapter1", + "base_model.model.lin0.hada_w2_a.adapter1", + "base_model.model.lin0.hada_w2_b.adapter1", + ) + + def test_requires_grad_lokr_different_targets(self): + # test two different LoKr adapters that target different modules + config0 = LoKrConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoKrConfig(target_modules=["lin1"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active pter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # change activate pter to pter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lokr_w1.adapter1", + "base_model.model.lin1.lokr_w2.adapter1", + ) + + # disable all pters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + 
"base_model.model.lin1.lokr_w1.adapter1", + "base_model.model.lin1.lokr_w2.adapter1", + ) + + def test_requires_grad_lokr_same_targets(self): + # same as previous test, except that LoKr adapters target the same layer + config0 = LoKrConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoKrConfig(target_modules=["lin0"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.adapter1", + "base_model.model.lin0.lokr_w2.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.adapter1", + "base_model.model.lin0.lokr_w2.adapter1", + ) From 3ff90626b6c4ec5c611392298e0f0339132bcc24 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:21:23 +0100 Subject: [PATCH 36/65] FEAT: Make safe serialization the default one (#1088) * make safe serialization the default one * adapt tests * fix final tests' * adapt from suggestion --- src/peft/peft_model.py | 2 +- tests/test_adaption_prompt.py | 57 ++++++++++++++++++++++++--- tests/test_custom_models.py | 4 ++ tests/test_decoder_models.py | 8 ++++ tests/test_encoder_decoder_models.py | 8 ++++ tests/test_multitask_prompt_tuning.py | 47 +++++++++++++++++++++- tests/testing_common.py | 38 +++++++++++------- 7 files changed, 142 insertions(+), 22 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index ef66f0cf1c..e0f0977e28 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -157,7 +157,7 @@ def peft_config(self, value: Dict[str, PeftConfig]): def save_pretrained( self, save_directory: str, - safe_serialization: bool = False, + safe_serialization: bool = True, selected_adapters: Optional[List[str]] = None, **kwargs: Any, ): diff --git a/tests/test_adaption_prompt.py b/tests/test_adaption_prompt.py index d3f2e74140..117c43a427 100644 --- a/tests/test_adaption_prompt.py +++ b/tests/test_adaption_prompt.py @@ -115,6 +115,51 @@ def make_inputs_require_grad(module, input, output): self.assertTrue(dummy_output.requires_grad) + def test_save_pretrained_regression(self) -> None: + seed = 420 + torch.manual_seed(seed) + model = LlamaForCausalLM(self._create_test_llama_config()) + config = AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname, safe_serialization=False) + + torch.manual_seed(seed) + model_from_pretrained = LlamaForCausalLM(self._create_test_llama_config()) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + # check if the state dicts are equal + state_dict = get_peft_model_state_dict(model) + 
state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained) + + # check if same keys + self.assertEqual(state_dict.keys(), state_dict_from_pretrained.keys()) + + # Check that the number of saved parameters is 4 -- 2 layers of (tokens and gate). + self.assertEqual(len(list(state_dict.keys())), 4) + + # check if tensors equal + for key in state_dict.keys(): + self.assertTrue( + torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + ) + + # check if `adapter_model.bin` is present + self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin"))) + + # check if `adapter_config.json` is present + self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_config.json"))) + + # check if `model.safetensors` is not present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "model.safetensors"))) + + # check if `config.json` is not present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "config.json"))) + def test_save_pretrained(self) -> None: seed = 420 torch.manual_seed(seed) @@ -149,13 +194,13 @@ def test_save_pretrained(self) -> None: ) # check if `adapter_model.bin` is present - self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin"))) + self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.safetensors"))) # check if `adapter_config.json` is present self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_config.json"))) - # check if `pytorch_model.bin` is not present - self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "pytorch_model.bin"))) + # check if `model.safetensors` is not present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "model.safetensors"))) # check if `config.json` is not present self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "config.json"))) @@ -199,13 +244,13 @@ def test_save_pretrained_selected_adapters(self) -> None: ) # check if `adapter_model.bin` is present - self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin"))) + self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.safetensors"))) # check if `adapter_config.json` is present self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_config.json"))) - # check if `pytorch_model.bin` is not present - self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "pytorch_model.bin"))) + # check if `model.safetensors` is not present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "model.safetensors"))) # check if `config.json` is not present self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "config.json"))) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index ca3f1cc48a..da43d78bf9 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -408,6 +408,10 @@ def test_prepare_for_training_parametrized(self, test_name, model_id, config_cls def test_save_pretrained(self, test_name, model_id, config_cls, config_kwargs): self._test_save_pretrained(model_id, config_cls, config_kwargs) + @parameterized.expand(TEST_CASES) + def test_save_pretrained_pickle(self, test_name, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs, safe_serialization=False) + @parameterized.expand(TEST_CASES) def test_from_pretrained_config_construction(self, test_name, model_id, config_cls, config_kwargs): self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs) diff 
--git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index 3afd85c015..b6d76216d0 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -136,10 +136,18 @@ def test_prompt_tuning_config_invalid_args(self): def test_save_pretrained(self, test_name, model_id, config_cls, config_kwargs): self._test_save_pretrained(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_save_pretrained_pickle(self, test_name, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs, safe_serialization=False) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_save_pretrained_selected_adapters(self, test_name, model_id, config_cls, config_kwargs): self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_save_pretrained_selected_adapters_pickle(self, test_name, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs, safe_serialization=False) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_from_pretrained_config_construction(self, test_name, model_id, config_cls, config_kwargs): self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index e1f9cb239d..8aab9ed044 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -70,10 +70,18 @@ def test_prepare_for_training_parametrized(self, test_name, model_id, config_cls def test_save_pretrained(self, test_name, model_id, config_cls, config_kwargs): self._test_save_pretrained(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_save_pretrained_pickle(self, test_name, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs, safe_serialization=False) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_save_pretrained_selected_adapters(self, test_name, model_id, config_cls, config_kwargs): self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) + def test_save_pretrained_selected_adapters_pickle(self, test_name, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs, safe_serialization=False) + @parameterized.expand(PeftTestConfigManager.get_grid_parameters(FULL_GRID)) def test_from_pretrained_config_construction(self, test_name, model_id, config_cls, config_kwargs): self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs) diff --git a/tests/test_multitask_prompt_tuning.py b/tests/test_multitask_prompt_tuning.py index 5fc9f36a1d..9aa6b8d7d9 100644 --- a/tests/test_multitask_prompt_tuning.py +++ b/tests/test_multitask_prompt_tuning.py @@ -145,7 +145,52 @@ def test_save_pretrained(self) -> None: ) ) - # check if `adapter_model.bin` is present + # check if `adapter_model.safetensors` is present + self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.safetensors"))) + + # check if `adapter_config.json` is present + 
self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_config.json"))) + + # check if `pytorch_model.bin` is not present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "pytorch_model.bin"))) + + # check if `config.json` is not present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "config.json"))) + + def test_save_pretrained_regression(self) -> None: + seed = 420 + torch.manual_seed(seed) + model = LlamaForCausalLM(self._create_test_llama_config()) + model = get_peft_model(model, self._create_multitask_prompt_tuning_config()) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname, safe_serialization=False) + + torch.manual_seed(seed) + model_from_pretrained = LlamaForCausalLM(self._create_test_llama_config()) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + # check if the state dicts are equal + state_dict = get_peft_model_state_dict(model) + + state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained) + + # check if same keys + self.assertEqual(state_dict.keys(), state_dict_from_pretrained.keys()) + + # Check that the number of saved parameters is 4 -- 2 layers of (tokens and gate). + self.assertEqual(len(list(state_dict.keys())), 3) + + # check if tensors equal + for key in state_dict.keys(): + self.assertTrue( + torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + ) + + # check if `adapter_model.bin` is present for regression self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin"))) # check if `adapter_config.json` is present diff --git a/tests/testing_common.py b/tests/testing_common.py index 3fa7da6163..7521c7df1c 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -263,7 +263,7 @@ def make_inputs_require_grad(module, input, output): self.assertTrue(dummy_output.requires_grad) - def _test_save_pretrained(self, model_id, config_cls, config_kwargs): + def _test_save_pretrained(self, model_id, config_cls, config_kwargs, safe_serialization=True): # ensure that the weights are randomly initialized if issubclass(config_cls, LoraConfig): config_kwargs = config_kwargs.copy() @@ -281,7 +281,10 @@ def _test_save_pretrained(self, model_id, config_cls, config_kwargs): model = model.to(self.torch_device) with tempfile.TemporaryDirectory() as tmp_dirname: - model.save_pretrained(tmp_dirname) + if safe_serialization: + model.save_pretrained(tmp_dirname) + else: + model.save_pretrained(tmp_dirname, safe_serialization=False) model_from_pretrained = self.transformers_class.from_pretrained(model_id) model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) @@ -305,14 +308,16 @@ def _test_save_pretrained(self, model_id, config_cls, config_kwargs): ) ) - # check if `adapter_model.bin` is present - self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin"))) + target_adapter_filename = "adapter_model.safetensors" if safe_serialization else "adapter_model.bin" + + # check if `adapter_model.safetensors` is present + self.assertTrue(os.path.exists(os.path.join(tmp_dirname, target_adapter_filename))) # check if `adapter_config.json` is present self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_config.json"))) - # check if `pytorch_model.bin` is not present - self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "pytorch_model.bin"))) + # check if `model.safetensors` is not 
present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "model.safetensors"))) # check if `config.json` is not present self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "config.json"))) @@ -320,7 +325,7 @@ def _test_save_pretrained(self, model_id, config_cls, config_kwargs): self.check_modelcard(tmp_dirname, model) self.check_config_json(tmp_dirname, model) - def _test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs): + def _test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs, safe_serialization=True): if issubclass(config_cls, AdaLoraConfig): # AdaLora does not support adding more than 1 adapter return @@ -349,7 +354,10 @@ def _test_save_pretrained_selected_adapters(self, model_id, config_cls, config_k model.add_adapter("new_adapter", new_adapter_config) with tempfile.TemporaryDirectory() as tmp_dirname: - model.save_pretrained(tmp_dirname) + if safe_serialization: + model.save_pretrained(tmp_dirname) + else: + model.save_pretrained(tmp_dirname, safe_serialization=False) model_from_pretrained = self.transformers_class.from_pretrained(model_id) model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) @@ -379,17 +387,19 @@ def _test_save_pretrained_selected_adapters(self, model_id, config_cls, config_k ) ) - # check if `adapter_model.bin` is present - self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin"))) - self.assertTrue(os.path.exists(os.path.join(new_adapter_dir, "adapter_model.bin"))) + target_adapter_filename = "adapter_model.safetensors" if safe_serialization else "adapter_model.bin" + + # check if `adapter_model.safetensors` is present + self.assertTrue(os.path.exists(os.path.join(tmp_dirname, target_adapter_filename))) + self.assertTrue(os.path.exists(os.path.join(new_adapter_dir, target_adapter_filename))) # check if `adapter_config.json` is present self.assertTrue(os.path.exists(os.path.join(tmp_dirname, "adapter_config.json"))) self.assertTrue(os.path.exists(os.path.join(new_adapter_dir, "adapter_config.json"))) - # check if `pytorch_model.bin` is not present - self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "pytorch_model.bin"))) - self.assertFalse(os.path.exists(os.path.join(new_adapter_dir, "pytorch_model.bin"))) + # check if `model.safetensors` is not present + self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "model.safetensors"))) + self.assertFalse(os.path.exists(os.path.join(new_adapter_dir, "model.safetensors"))) # check if `config.json` is not present self.assertFalse(os.path.exists(os.path.join(tmp_dirname, "config.json"))) From 70302d7b4fc667f6b2e80ac1b8cccde081270d1e Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 16 Nov 2023 12:05:22 +0100 Subject: [PATCH 37/65] FEAT: Merging only specified `adapter_names` when calling `merge` (#1132) * working v1 * add tests * remove * add it also for lokr and loha, left a todo * Update tests/testing_common.py Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> * better test * up * fix tests * credits contrib and suggestions from disscussions * credits contrib and suggestions from disscussions * address last comments --------- Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Co-authored-by: Benjamin Bossan --- src/peft/tuners/adalora/layer.py | 11 +++++- src/peft/tuners/ia3/layer.py | 22 ++++++++--- src/peft/tuners/ia3/model.py | 8 +++- src/peft/tuners/lora/layer.py | 
35 +++++++++++++---- src/peft/tuners/lora/model.py | 23 ++++++++--- src/peft/tuners/lycoris_utils.py | 19 +++++---- tests/test_custom_models.py | 56 +++++++++++++++++++++++++++ tests/test_decoder_models.py | 13 +++++++ tests/testing_common.py | 66 ++++++++++++++++++++++++++++++++ 9 files changed, 225 insertions(+), 28 deletions(-) diff --git a/src/peft/tuners/adalora/layer.py b/src/peft/tuners/adalora/layer.py index 5777581e9b..d9fbf903c9 100644 --- a/src/peft/tuners/adalora/layer.py +++ b/src/peft/tuners/adalora/layer.py @@ -14,6 +14,7 @@ # limitations under the License. import warnings +from typing import List, Optional import torch import torch.nn.functional as F @@ -100,7 +101,7 @@ def __init__( self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) self.set_adapter(adapter_name) - def merge(self, safe_merge: bool = False) -> None: + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights into the base weights @@ -109,13 +110,19 @@ def merge(self, safe_merge: bool = False) -> None: If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ if self.merged: warnings.warn( f"Already following adapters were merged {','.join(self.merged_adapters)}. " f"You are now additionally merging {','.join(self.active_adapters)}." ) - for active_adapter in self.active_adapters: + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter in self.lora_A.keys(): if safe_merge: # Note that safe_merge will be slower than the normal merge diff --git a/src/peft/tuners/ia3/layer.py b/src/peft/tuners/ia3/layer.py index cd278a450a..50696a0e08 100644 --- a/src/peft/tuners/ia3/layer.py +++ b/src/peft/tuners/ia3/layer.py @@ -14,7 +14,7 @@ # limitations under the License. import warnings -from typing import Tuple, Union +from typing import List, Optional, Tuple, Union import torch import torch.nn as nn @@ -105,7 +105,7 @@ def update_layer(self, adapter_name, init_ia3_weights): self.to(self.weight.device) self.set_adapter(self.active_adapters) - def merge(self, safe_merge: bool = False) -> None: + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights into the base weights @@ -114,6 +114,9 @@ def merge(self, safe_merge: bool = False) -> None: If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ if self.merged: warnings.warn( @@ -121,7 +124,10 @@ def merge(self, safe_merge: bool = False) -> None: f"You are now additionally merging {','.join(self.active_adapters)}." 
) - for active_adapter in self.active_adapters: + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter in self.ia3_l.keys(): if safe_merge: orig_weights = transpose(self.weight, self.fan_in_fan_out).clone() @@ -237,7 +243,7 @@ def update_layer(self, adapter_name, init_ia3_weights): self.to(self.weight.device) self.set_adapter(self.active_adapters) - def merge(self, safe_merge: bool = False) -> None: + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights into the base weights @@ -246,6 +252,9 @@ def merge(self, safe_merge: bool = False) -> None: If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ if self.merged: warnings.warn( @@ -253,7 +262,10 @@ def merge(self, safe_merge: bool = False) -> None: f"You are now additionally merging {','.join(self.active_adapters)}." ) - for active_adapter in self.active_adapters: + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter in self.ia3_l.keys(): ia3_scaling = self.ia3_l[active_adapter].data if not self.is_feedforward: diff --git a/src/peft/tuners/ia3/model.py b/src/peft/tuners/ia3/model.py index f18fbf6b4b..29802359f7 100644 --- a/src/peft/tuners/ia3/model.py +++ b/src/peft/tuners/ia3/model.py @@ -17,6 +17,7 @@ import warnings from dataclasses import asdict from enum import Enum +from typing import List, Optional import torch from transformers.pytorch_utils import Conv1D @@ -297,7 +298,7 @@ def _prepare_adapter_config(self, peft_config, model_config): ] return peft_config - def merge_and_unload(self, safe_merge: bool = False): + def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None): r""" This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model as a standalone model. @@ -307,6 +308,9 @@ def merge_and_unload(self, safe_merge: bool = False): If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
""" if getattr(self.model, "is_loaded_in_8bit", False): raise ValueError("Cannot merge ia3 layers when the model is loaded in 8-bit mode") @@ -345,7 +349,7 @@ def merge_and_unload(self, safe_merge: bool = False): else: new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) - target.merge(safe_merge=safe_merge) + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) self._replace_module(parent, target_name, new_module, target) return self.model diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index ab9eb83fcc..e2ced1eee9 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -15,7 +15,7 @@ import math import warnings -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import torch import torch.nn as nn @@ -218,7 +218,7 @@ def __init__( self.is_target_conv_1d_layer = is_target_conv_1d_layer self.set_adapter(adapter_name) - def merge(self, safe_merge: bool = False) -> None: + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights into the base weights @@ -227,13 +227,20 @@ def merge(self, safe_merge: bool = False) -> None: If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ if self.merged: warnings.warn( f"Already following adapters were merged {','.join(self.merged_adapters)}. " f"You are now additionally merging {','.join(self.active_adapters)}." ) - for active_adapter in self.active_adapters: + + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter in self.lora_A.keys(): if safe_merge: # Note that safe_merge will be slower than the normal merge @@ -340,7 +347,7 @@ def __init__( self.update_layer_embedding(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) self.set_adapter(adapter_name) - def merge(self, safe_merge: bool = False) -> None: + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights into the base weights @@ -349,13 +356,20 @@ def merge(self, safe_merge: bool = False) -> None: If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ if self.merged: warnings.warn( f"Already following adapters were merged {','.join(self.merged_adapters)}. " f"You are now additionally merging {','.join(self.active_adapters)}." 
) - for active_adapter in self.active_adapters: + + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter in self.lora_embedding_A.keys(): if safe_merge: # Note that safe_merge will be slower than the normal merge @@ -480,7 +494,7 @@ def __init__( self.update_layer_conv2d(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) self.set_adapter(adapter_name) - def merge(self, safe_merge: bool = False) -> None: + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights inside the base weights @@ -489,13 +503,20 @@ def merge(self, safe_merge: bool = False) -> None: If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. """ if self.merged: warnings.warn( f"Already following adapters were merged {','.join(self.merged_adapters)}. " f"You are now additionally merging {','.join(self.active_adapters)}." ) - for active_adapter in self.active_adapters: + + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter in self.lora_A.keys(): if safe_merge: # Note that safe_merge will be slower than the normal merge diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 85bc8b2fd4..6b76ad9b6c 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -19,6 +19,7 @@ from enum import Enum from functools import reduce from itertools import chain +from typing import List, Optional import torch from torch import nn @@ -376,7 +377,13 @@ def _prepare_adapter_config(peft_config, model_config): ) return peft_config - def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False, safe_merge: bool = False): + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[List[str]] = None, + ): if merge: if getattr(self.model, "quantization_method", None) == "gptq": raise ValueError("Cannot merge LORA layers when the model is gptq quantized") @@ -430,7 +437,7 @@ def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False, sa else: new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) if merge: - target.merge(safe_merge=safe_merge) + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) self._replace_module(parent, target_name, new_module, target) # save any additional trainable modules part of `modules_to_save` @@ -671,7 +678,9 @@ def delete_adapter(self, adapter_name: str): self.active_adapter = new_adapter or [] - def merge_and_unload(self, progressbar: bool = False, safe_merge: bool = False): + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[List[str]] = None + ): r""" This method merges the LoRa layers into the base model. This is needed if someone wants to use the base model as a standalone model. 
@@ -682,7 +691,9 @@ def merge_and_unload(self, progressbar: bool = False, safe_merge: bool = False): safe_merge (`bool`): whether to activate the safe merging check to check if there is any potential Nan in the adapter weights - + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. Example: ```py @@ -695,7 +706,9 @@ def merge_and_unload(self, progressbar: bool = False, safe_merge: bool = False): >>> merged_model = model.merge_and_unload() ``` """ - return self._unload_and_optionally_merge(progressbar=progressbar, safe_merge=safe_merge) + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) def unload(self): """ diff --git a/src/peft/tuners/lycoris_utils.py b/src/peft/tuners/lycoris_utils.py index b82c960230..d3085c4831 100644 --- a/src/peft/tuners/lycoris_utils.py +++ b/src/peft/tuners/lycoris_utils.py @@ -18,7 +18,7 @@ from abc import abstractmethod from dataclasses import dataclass, field from itertools import chain -from typing import Dict, Optional, Set, Type, Union +from typing import Dict, List, Optional, Set, Type, Union import torch import torch.nn as nn @@ -134,13 +134,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def get_delta_weight(self, adapter_name: str) -> torch.Tensor: ... - def merge(self) -> None: + def merge(self, adapter_names: Optional[List[str]] = None) -> None: if self.merged: warnings.warn( f"Already following adapters were merged {','.join(self.merged_adapters)}. " f"You are now additionally merging {','.join(self.active_adapters)}." ) - for active_adapter in self.active_adapters: + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter in self._available_adapters: self.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) @@ -320,7 +323,9 @@ def _set_adapter_layers(self, enabled=True): if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): module.enable_adapters(enabled) - def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False): + def _unload_and_optionally_merge( + self, merge=True, progressbar: bool = False, adapter_names: Optional[List[str]] = None + ): if merge: if getattr(self.model, "quantization_method", None) == "gptq": raise ValueError("Cannot merge LOHA layers when the model is gptq quantized") @@ -355,7 +360,7 @@ def _unload_and_optionally_merge(self, merge=True, progressbar: bool = False): "Cannot convert current module to torch module, currently only adapters for nn.Linear and nn.Conv2d are supported" ) if merge: - target.merge() + target.merge(adapter_names=adapter_names) self._replace_module(parent, target_name, new_module, target) # save any additional trainable modules part of `modules_to_save` @@ -370,8 +375,8 @@ def enable_adapter_layers(self): def disable_adapter_layers(self): self._set_adapter_layers(enabled=False) - def merge_and_unload(self, progressbar: bool = False): - return self._unload_and_optionally_merge(progressbar=progressbar) + def merge_and_unload(self, progressbar: bool = False, adapter_names: Optional[List[str]] = None): + return self._unload_and_optionally_merge(progressbar=progressbar, adapter_names=adapter_names) def set_adapter(self, adapter_name): for module in self.model.modules(): diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index da43d78bf9..14ae59b05c 100644 --- 
a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -934,6 +934,62 @@ def test_multiple_active_adapters_merge_and_unmerge( self.assertTrue(torch.allclose(disabled_adapter_output, base_output, atol=1e-4)) + @parameterized.expand(MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES) + def test_merge_layers_multi(self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2): + model = MLP(bias=tuner_method != "ia3") + model.eval() + + config_1 = config_cls(**config_kwargs_1) + config_2 = config_cls(**config_kwargs_2) + + model = get_peft_model(model, config_1) + + dummy_input = self.prepare_inputs_for_testing() + model.eval() + + with torch.inference_mode(): + logits_adapter_1 = model(**dummy_input)[0] + + model.add_adapter("adapter-2", config_2) + model.set_adapter("adapter-2") + model.eval() + + with torch.inference_mode(): + logits_adapter_2 = model(**dummy_input)[0] + + self.assertFalse(torch.allclose(logits_adapter_1, logits_adapter_2, atol=1e-3, rtol=1e-3)) + + model.set_adapter("default") + + with torch.inference_mode(): + logits_adapter_1_after_set = model(**dummy_input)[0] + + self.assertTrue(torch.allclose(logits_adapter_1_after_set, logits_adapter_1, atol=1e-3, rtol=1e-3)) + + model_copy = copy.deepcopy(model) + model_copy_2 = copy.deepcopy(model) + model_merged_all = model.merge_and_unload(adapter_names=["adapter-2", "default"]) + + with torch.inference_mode(): + logits_merged_all = model_merged_all(**dummy_input)[0] + + self.assertFalse(torch.allclose(logits_merged_all, logits_adapter_2, atol=1e-3, rtol=1e-3)) + self.assertFalse(torch.allclose(logits_merged_all, logits_adapter_1, atol=1e-3, rtol=1e-3)) + + model_merged_adapter_2 = model_copy.merge_and_unload(adapter_names=["adapter-2"]) + + with torch.inference_mode(): + logits_merged_adapter_2 = model_merged_adapter_2(**dummy_input)[0] + + self.assertTrue(torch.allclose(logits_merged_adapter_2, logits_adapter_2, atol=1e-3, rtol=1e-3)) + + model_merged_adapter_default = model_copy_2.merge_and_unload(adapter_names=["default"]) + + with torch.inference_mode(): + logits_merged_adapter_default = model_merged_adapter_default(**dummy_input)[0] + + self.assertTrue(torch.allclose(logits_merged_adapter_default, logits_adapter_1, atol=1e-3, rtol=1e-3)) + class RequiresGradTester(unittest.TestCase): """Test that requires_grad is set correctly in specific circumstances diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index b6d76216d0..a6b3d16d4d 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -165,6 +165,19 @@ def test_from_pretrained_config_construction(self, test_name, model_id, config_c def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): self._test_merge_layers(model_id, config_cls, config_kwargs) + @parameterized.expand( + PeftTestConfigManager.get_grid_parameters( + { + "model_ids": PEFT_DECODER_MODELS_TO_TEST, + "lora_kwargs": {"init_lora_weights": [False]}, + "ia3_kwargs": {"init_ia3_weights": [False]}, + "task_type": "CAUSAL_LM", + }, + ) + ) + def test_merge_layers_multi(self, test_name, model_id, config_cls, config_kwargs): + self._test_merge_layers_multi(model_id, config_cls, config_kwargs) + @parameterized.expand( PeftTestConfigManager.get_grid_parameters( { diff --git a/tests/testing_common.py b/tests/testing_common.py index 7521c7df1c..2c4a4f5b2b 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import copy import json import os import pickle @@ -571,6 +572,71 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): logits_merged_from_pretrained = model_from_pretrained(**dummy_input)[0] self.assertTrue(torch.allclose(logits_merged, logits_merged_from_pretrained, atol=atol, rtol=rtol)) + def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] + + if ("gpt2" in model_id.lower()) and (config_cls == IA3Config): + self.skipTest("Merging GPT2 adapters not supported for IAΒ³ (yet)") + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + if config.peft_type not in supported_peft_types: + return + + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config) + + model = model.to(self.torch_device) + + dummy_input = self.prepare_inputs_for_testing() + model.eval() + + with torch.inference_mode(): + logits_adapter_1 = model(**dummy_input)[0] + + model.add_adapter("adapter-2", config) + model.set_adapter("adapter-2") + model.eval() + + with torch.inference_mode(): + logits_adapter_2 = model(**dummy_input)[0] + + self.assertFalse(torch.allclose(logits_adapter_1, logits_adapter_2, atol=1e-3, rtol=1e-3)) + + model.set_adapter("default") + + with torch.inference_mode(): + logits_adapter_1_after_set = model(**dummy_input)[0] + + self.assertTrue(torch.allclose(logits_adapter_1_after_set, logits_adapter_1, atol=1e-3, rtol=1e-3)) + + model_copy = copy.deepcopy(model) + model_copy_2 = copy.deepcopy(model) + model_merged_all = model.merge_and_unload(adapter_names=["adapter-2", "default"]) + + with torch.inference_mode(): + logits_merged_all = model_merged_all(**dummy_input)[0] + + self.assertFalse(torch.allclose(logits_merged_all, logits_adapter_2, atol=1e-3, rtol=1e-3)) + self.assertFalse(torch.allclose(logits_merged_all, logits_adapter_1, atol=1e-3, rtol=1e-3)) + + model_merged_adapter_2 = model_copy.merge_and_unload(adapter_names=["adapter-2"]) + + with torch.inference_mode(): + logits_merged_adapter_2 = model_merged_adapter_2(**dummy_input)[0] + + self.assertTrue(torch.allclose(logits_merged_adapter_2, logits_adapter_2, atol=1e-3, rtol=1e-3)) + + model_merged_adapter_default = model_copy_2.merge_and_unload(adapter_names=["default"]) + + with torch.inference_mode(): + logits_merged_adapter_default = model_merged_adapter_default(**dummy_input)[0] + + self.assertTrue(torch.allclose(logits_merged_adapter_default, logits_adapter_1, atol=1e-3, rtol=1e-3)) + def _test_generate(self, model_id, config_cls, config_kwargs): model = self.transformers_class.from_pretrained(model_id) config = config_cls( From 5a3a5acff2d679358251742564f7b12efbee3a41 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 16 Nov 2023 12:45:12 +0100 Subject: [PATCH 38/65] Refactor base layer pattern (#1106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Description Refactor all tuners (where it applies, i.e. not prompt tuning) to use the "base layer pattern". This means that the adapter layer will always hold a reference to the original layer that it modifies. This pattern is already partly used (e.g. LoRA bnb, gptq layers), now it is consistently used everywhere when applicable. This PR is a companion PR to #1069, where I first added these changes. 
They are now extracted to a separate PR to make code review easier and to advance more quickly.

Implementation

The main change is that the adapter layer wraps the original layer and calls forward on that layer, instead of doing stuff like this:

    F.linear(input, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

which completely circumvents the call to the target layer's forward method. With the base layer pattern, we now call the target layer's forward method. Therefore, if the target layer is another adapter layer (which will be crucial for mixed adapters), we call its forward method correctly. Also, this should allow passing extra arguments, like lora_scale, to forward.

This change has the nice side benefit that we no longer need to use _init_empty_weights -- in fact, we don't initialize any of the target layer's weights anymore, since we have a reference to it. There is thus no risk of slow but superfluous initialization of layers.

Moreover, I could greatly simplify merge_and_unload by just using the base_layer instead of having to create a completely new layer. For OPT-350m, this results in a 15x speedup.

Note that, same as for the bnb layers, this should be backwards compatible, since the adapter weights and their state_dicts are not affected by this change. I used #1115 for regression testing.

Somewhat unrelated changes

* During debugging, I got very annoyed with the fact that the reprs of adapter layers and normal PyTorch layers are hard to distinguish, e.g. the type is just "Linear". Now, for adapter layers, it is prefixed by the adapter type, e.g. "lora.Linear". This should have no further implications except for the repr (e.g. state_dict remains unaffected).
* For LoHa and LoKr, I had to change the init of weights when using init_weights=False. This is because of what is discussed in "Numerical instabilities with LoHa" (#1058).
* IA³ now has the unload method too.
* LoHa and LoKr now support safe_merge=True when merging layers.

Migration guide

For 99% of users, the code should continue working as usual, because the API stays the same. Only low-level details have been changed.

Code that relies on isinstance checks on specific PEFT classes may break. E.g. the LoRA Linear layer no longer inherits from nn.Linear. It is, however, still a BaseTunerLayer. The same logic applies for other layer types like Conv2d and for other tuners like IA³.

To retrieve the base layer of an adapter layer, you should now call module.get_base_layer() if you deal with a BaseTunerLayer. Don't rely on something like module.weight being present (though it might be).
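To make the pattern concrete, here is a minimal, self-contained sketch of "hold a reference to the base layer and delegate to its forward". This is not the actual PEFT implementation; the class name ScaledAdapterLinear and its single scale parameter are made up purely for illustration.

```py
import torch
import torch.nn as nn


class ScaledAdapterLinear(nn.Module):
    """Toy adapter layer that wraps a base layer instead of replacing or re-implementing it."""

    def __init__(self, base_layer: nn.Module) -> None:
        super().__init__()
        self.base_layer = base_layer  # keep a reference to the original layer
        # a tiny "adapter": one learnable scale per output feature
        self.scale = nn.Parameter(torch.ones(base_layer.out_features))
        # the wrapped weights stay frozen; only the adapter parameter trains
        for p in self.base_layer.parameters():
            p.requires_grad = False

    def get_base_layer(self) -> nn.Module:
        return self.base_layer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # delegate to the wrapped layer's own forward instead of calling F.linear directly,
        # so wrapping another adapter layer (or a quantized layer) still works
        return self.base_layer(x) * self.scale


base = nn.Linear(4, 2)
wrapped = ScaledAdapterLinear(base)
print(wrapped(torch.randn(3, 4)).shape)  # torch.Size([3, 2])
```

Because the wrapper only stores a reference, merge-style operations can modify base_layer.weight in place and unloading can simply return get_base_layer(), which is the simplification of merge_and_unload referred to above.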
--- src/peft/tuners/__init__.py | 7 - src/peft/tuners/adalora/bnb.py | 68 ++++---- src/peft/tuners/adalora/gptq.py | 21 ++- src/peft/tuners/adalora/layer.py | 64 ++++--- src/peft/tuners/adalora/model.py | 36 ++-- src/peft/tuners/ia3/bnb.py | 86 ++++----- src/peft/tuners/ia3/layer.py | 182 +++++++++----------- src/peft/tuners/ia3/model.py | 197 +++++++++++---------- src/peft/tuners/loha/layer.py | 161 ++++++++++------- src/peft/tuners/loha/model.py | 36 +++- src/peft/tuners/lokr/layer.py | 154 ++++++++++------- src/peft/tuners/lokr/model.py | 36 +++- src/peft/tuners/lora/bnb.py | 54 +++--- src/peft/tuners/lora/gptq.py | 23 +-- src/peft/tuners/lora/layer.py | 208 +++++++++++----------- src/peft/tuners/lora/model.py | 155 ++++++----------- src/peft/tuners/lycoris_utils.py | 220 +++++++++++------------- src/peft/tuners/tuners_utils.py | 29 ++++ src/peft/utils/other.py | 14 ++ tests/test_custom_models.py | 95 +++++++++- tests/test_decoder_models.py | 1 + tests/test_feature_extraction_models.py | 1 + tests/testing_common.py | 11 +- 23 files changed, 991 insertions(+), 868 deletions(-) diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index dd4c94b947..b357d47dc1 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -27,10 +27,3 @@ from .prefix_tuning import PrefixEncoder, PrefixTuningConfig from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit - -# Mapping of tuners that support direct plugging -TUNERS_MAPPING = { - "LORA": LoraModel, - "IA3": IA3Model, - "ADALORA": AdaLoraModel, -} diff --git a/src/peft/tuners/adalora/bnb.py b/src/peft/tuners/adalora/bnb.py index 3ccfd91b2b..a37745569a 100644 --- a/src/peft/tuners/adalora/bnb.py +++ b/src/peft/tuners/adalora/bnb.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import bitsandbytes as bnb +from typing import Any + import torch from peft.import_utils import is_bnb_4bit_available, is_bnb_available @@ -23,38 +24,28 @@ if is_bnb_available(): - class SVDLinear8bitLt(bnb.nn.Linear8bitLt, AdaLoraLayer): + class SVDLinear8bitLt(torch.nn.Module, AdaLoraLayer): # Low-rank matrix for SVD-based adaptation def __init__( self, - adapter_name, - in_features, - out_features, + base_layer: torch.nn.Module, + adapter_name: str, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ) -> None: - bnb.nn.Linear8bitLt.__init__( - self, - in_features, - out_features, - bias=kwargs.get("bias", True), - has_fp16_weights=kwargs.get("has_fp16_weights", True), - memory_efficient_backward=kwargs.get("memory_efficient_backward", False), - threshold=kwargs.get("threshold", 0.0), - index=kwargs.get("index", None), - ) - AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features) + super().__init__() + AdaLoraLayer.__init__(self, base_layer) # Freezing the pre-trained weight matrix - self.weight.requires_grad = False + self.get_base_layer().weight.requires_grad = False - init_lora_weights = kwargs.pop("init_lora_weights", True) self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def forward(self, x: torch.Tensor) -> torch.Tensor: - result = super().forward(x) + # note: no check for self.merged because merging is not supported (yet) + result = self.base_layer(x) if self.disable_adapters: return result @@ -82,40 +73,35 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: result += output return result + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep + if is_bnb_4bit_available(): - class SVDLinear4bit(bnb.nn.Linear4bit, AdaLoraLayer): + class SVDLinear4bit(torch.nn.Module, AdaLoraLayer): # Low-rank matrix for SVD-based adaptation def __init__( self, - adapter_name, - in_features, - out_features, + base_layer: torch.nn.Module, + adapter_name: str, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ) -> None: - bnb.nn.Linear4bit.__init__( - self, - in_features, - out_features, - bias=kwargs.get("bias", True), - compute_dtype=kwargs.get("compute_dtype", torch.float32), - compress_statistics=kwargs.get("compress_statistics", True), - quant_type=kwargs.get("quant_type", "nf4"), - ) - AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features) + super().__init__() + AdaLoraLayer.__init__(self, base_layer) # Freezing the pre-trained weight matrix - self.weight.requires_grad = False + self.get_base_layer().weight.requires_grad = False - init_lora_weights = kwargs.pop("init_lora_weights", True) self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) - def forward(self, x: torch.Tensor) -> torch.Tensor: - result = super().forward(x) + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + result = self.base_layer(x, *args, **kwargs) if self.disable_adapters: return result @@ -151,3 +137,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: output = output * scaling / ranknum result += output return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." 
+ rep diff --git a/src/peft/tuners/adalora/gptq.py b/src/peft/tuners/adalora/gptq.py index 92de32ac15..1c14ea9c44 100644 --- a/src/peft/tuners/adalora/gptq.py +++ b/src/peft/tuners/adalora/gptq.py @@ -20,22 +20,21 @@ class SVDQuantLinear(torch.nn.Module, AdaLoraLayer): def __init__( self, + base_layer, adapter_name, - quant_linear_module, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ) -> None: - torch.nn.Module.__init__(self) - AdaLoraLayer.__init__( - self, in_features=quant_linear_module.infeatures, out_features=quant_linear_module.outfeatures - ) - self.quant_linear_module = quant_linear_module - self.weight = quant_linear_module.qweight - init_lora_weights = kwargs.pop("init_lora_weights", True) + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def forward(self, x: torch.Tensor) -> torch.Tensor: result = self.quant_linear_module(x) @@ -67,3 +66,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: output = output.to(expected_dtype) result += output return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep diff --git a/src/peft/tuners/adalora/layer.py b/src/peft/tuners/adalora/layer.py index d9fbf903c9..b4a98de039 100644 --- a/src/peft/tuners/adalora/layer.py +++ b/src/peft/tuners/adalora/layer.py @@ -14,10 +14,9 @@ # limitations under the License. import warnings -from typing import List, Optional +from typing import Any, List, Optional import torch -import torch.nn.functional as F from torch import nn from peft.tuners.lora import LoraLayer @@ -30,12 +29,8 @@ class AdaLoraLayer(LoraLayer): adapter_layer_names = ("lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B") # other_param_names is defined in LoraLayer - def __init__( - self, - in_features: int, - out_features: int, - ): - super().__init__(in_features, out_features) + def __init__(self, base_layer: nn.Module) -> None: + super().__init__(base_layer) self.lora_E = nn.ParameterDict({}) self.lora_A = nn.ParameterDict({}) self.lora_B = nn.ParameterDict({}) @@ -64,7 +59,12 @@ def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weig self.scaling[adapter_name] = lora_alpha if lora_alpha > 0 else float(r) if init_lora_weights: self.reset_lora_parameters(adapter_name) - self.to(self.weight.device) + + if hasattr(self.get_base_layer(), "qweight"): + # QuantLinear + self.to(self.get_base_layer().qweight.device) + else: + self.to(self.get_base_layer().weight.device) self.set_adapter(self.active_adapters) def reset_lora_parameters(self, adapter_name): @@ -74,32 +74,27 @@ def reset_lora_parameters(self, adapter_name): nn.init.normal_(self.lora_B[adapter_name], mean=0.0, std=0.02) -class SVDLinear(nn.Linear, AdaLoraLayer): +class SVDLinear(nn.Module, AdaLoraLayer): # SVD-based adaptation by a dense layer def __init__( self, + base_layer: nn.Module, adapter_name: str, - in_features: int, - out_features: int, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, fan_in_fan_out: bool = False, + init_lora_weights: bool = True, **kwargs, ) -> None: - init_lora_weights = kwargs.pop("init_lora_weights", True) - nn.Linear.__init__(self, in_features, out_features, **kwargs) - 
AdaLoraLayer.__init__(self, in_features=in_features, out_features=out_features) + super().__init__() + AdaLoraLayer.__init__(self, base_layer) # Freezing the pre-trained weight matrix - self.weight.requires_grad = False + self.get_base_layer().weight.requires_grad = False self.fan_in_fan_out = fan_in_fan_out - if fan_in_fan_out: - self.weight.data = self.weight.data.T - - nn.Linear.reset_parameters(self) + self._active_adapter = adapter_name self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ @@ -119,15 +114,17 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N f"Already following adapters were merged {','.join(self.merged_adapters)}. " f"You are now additionally merging {','.join(self.active_adapters)}." ) + if adapter_names is None: adapter_names = self.active_adapters for active_adapter in adapter_names: + base_layer = self.get_base_layer() if active_adapter in self.lora_A.keys(): if safe_merge: # Note that safe_merge will be slower than the normal merge # because of the copy operation. - orig_weights = self.weight.data.clone() + orig_weights = base_layer.weight.data.clone() orig_weights += self.get_delta_weight(active_adapter) if not torch.isfinite(orig_weights).all(): @@ -135,9 +132,9 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" ) - self.weight.data = orig_weights + base_layer.weight.data = orig_weights else: - self.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -147,7 +144,7 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.lora_A.keys(): - self.weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) def get_delta_weight(self, adapter) -> torch.Tensor: return ( @@ -156,19 +153,16 @@ def get_delta_weight(self, adapter) -> torch.Tensor: / (self.ranknum[adapter] + 1e-5) ) - def _linear(self, input: torch.Tensor) -> torch.Tensor: - return F.linear(input, transpose(self.weight, self.fan_in_fan_out), bias=self.bias) - - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: # TODO: SVDLinear does not convert dtype, unlike lora linear, is that correct? if self.disable_adapters: if self.merged: self.unmerge() - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) elif self.merged: - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) else: - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) for active_adapter in self.active_adapters: if active_adapter not in self.lora_A.keys(): continue @@ -183,8 +177,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return result + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep + -class RankAllocator(object): +class RankAllocator: """ The RankAllocator for AdaLoraModel. 
Paper: https://openreview.net/pdf?id=lq62uWRJjiY diff --git a/src/peft/tuners/adalora/model.py b/src/peft/tuners/adalora/model.py index a863acce31..71f2ed7579 100644 --- a/src/peft/tuners/adalora/model.py +++ b/src/peft/tuners/adalora/model.py @@ -20,6 +20,7 @@ from peft.import_utils import is_bnb_4bit_available, is_bnb_available from peft.tuners.lora import LoraConfig, LoraModel +from peft.tuners.tuners_utils import BaseTunerLayer from peft.utils import ( TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, _freeze_adapter, @@ -67,6 +68,8 @@ class AdaLoraModel(LoraModel): - **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model. """ + # Note: don't redefine prefix here, it should be inherited from LoraModel + def __init__(self, model, config, adapter_name): super().__init__(model, config, adapter_name) @@ -121,7 +124,7 @@ def _create_and_replace( loaded_in_4bit = optional_kwargs.get("loaded_in_4bit", False) if (loaded_in_8bit or loaded_in_4bit) and not is_bnb_available(): raise ImportError( - "To use Lora with 8-bit quantization, please install the `bitsandbytes` package. " + "To use AdaLora with 8-bit quantization, please install the `bitsandbytes` package. " "You can install it with `pip install bitsandbytes`." ) kwargs = { @@ -138,7 +141,7 @@ def _create_and_replace( if quantization_config is not None: kwargs["gptq_quantization_config"] = quantization_config - # If it is not a LoraLayer, create a new module, else update it with new adapters + # If it is not an AdaLoraLayer, create a new module, else update it with new adapters if not isinstance(target, AdaLoraLayer): new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs) if adapter_name != self.active_adapter: @@ -159,11 +162,15 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): gptq_quantization_config = kwargs.get("gptq_quantization_config", None) AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) - bias = target.bias is not None loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) - if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): kwargs.update( { "has_fp16_weights": target.state.has_fp16_weights, @@ -172,8 +179,8 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): "index": target.index, } ) - new_module = SVDLinear8bitLt(adapter_name, target.in_features, target.out_features, bias=bias, **kwargs) - elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target, bnb.nn.Linear4bit): + new_module = SVDLinear8bitLt(target, adapter_name, **kwargs) + elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): fourbit_kwargs = kwargs.copy() fourbit_kwargs.update( { @@ -182,25 +189,18 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): "quant_type": target.weight.quant_type, } ) - new_module = SVDLinear4bit( - adapter_name, target.in_features, target.out_features, bias=bias, **fourbit_kwargs - ) + new_module = SVDLinear4bit(target, adapter_name, **fourbit_kwargs) elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear): - new_module = SVDQuantLinear(adapter_name, target, **kwargs) - target.weight = target.qweight + new_module = SVDQuantLinear(target, adapter_name, 
**kwargs) else: - if isinstance(target, torch.nn.Linear): - in_features, out_features = target.in_features, target.out_features + if isinstance(target_base_layer, torch.nn.Linear): if kwargs["fan_in_fan_out"]: warnings.warn( "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " "Setting fan_in_fan_out to False." ) kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False - elif isinstance(target, Conv1D): - in_features, out_features = ( - target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape - ) + elif isinstance(target_base_layer, Conv1D): if not kwargs["fan_in_fan_out"]: warnings.warn( "fan_in_fan_out is set to False but the target module is `Conv1D`. " @@ -212,7 +212,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): f"Target module {target} is not supported. " f"Currently, only `torch.nn.Linear` and `Conv1D` are supported." ) - new_module = SVDLinear(adapter_name, in_features, out_features, bias=bias, **kwargs) + new_module = SVDLinear(target, adapter_name, **kwargs) return new_module diff --git a/src/peft/tuners/ia3/bnb.py b/src/peft/tuners/ia3/bnb.py index 2aa37c1d5c..2666b3ab6e 100644 --- a/src/peft/tuners/ia3/bnb.py +++ b/src/peft/tuners/ia3/bnb.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import bitsandbytes as bnb +from typing import Any + import torch from peft.import_utils import is_bnb_4bit_available, is_bnb_available @@ -23,39 +24,27 @@ if is_bnb_available(): - class Linear8bitLt(bnb.nn.Linear8bitLt, IA3Layer): + class Linear8bitLt(torch.nn.Module, IA3Layer): # (IA)^3 implemented in a dense layer def __init__( self, - adapter_name, - in_features, - out_features, - is_feedforward, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, **kwargs, ) -> None: - bnb.nn.Linear8bitLt.__init__( - self, - in_features, - out_features, - bias=kwargs.get("bias", True), - has_fp16_weights=kwargs.get("has_fp16_weights", True), - memory_efficient_backward=kwargs.get("memory_efficient_backward", False), - threshold=kwargs.get("threshold", 0.0), - index=kwargs.get("index", None), - ) - IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward) - self.is_feedforward = is_feedforward + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) # Freezing the pre-trained weight matrix - self.weight.requires_grad = False - - init_ia3_weights = kwargs.pop("init_ia3_weights", True) + self.get_base_layer().weight.requires_grad = False self.update_layer(adapter_name, init_ia3_weights) - self.set_adapter(adapter_name) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) if self.disable_adapters: - return super().forward(x) + return self.base_layer(x) ia3_scaling = 1 for active_adapter in self.active_adapters: @@ -67,10 +56,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if requires_conversion: x = x.float() if self.is_feedforward: - result = super().forward(x * ia3_scaling) + result = self.base_layer(x * ia3_scaling) expected_dtype = result.dtype else: - result = super().forward(x) + result = self.base_layer(x) expected_dtype = result.dtype result = result * ia3_scaling @@ -79,41 +68,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return 
result + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." + rep + if is_bnb_4bit_available(): - class Linear4bit(bnb.nn.Linear4bit, IA3Layer): + class Linear4bit(torch.nn.Module, IA3Layer): # IA3 implemented in a dense layer def __init__( self, - adapter_name, - in_features, - out_features, - is_feedforward, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, **kwargs, ) -> None: - bnb.nn.Linear4bit.__init__( - self, - in_features, - out_features, - bias=kwargs.get("bias", True), - compute_dtype=kwargs.get("compute_dtype", torch.float32), - compress_statistics=kwargs.get("compress_statistics", True), - quant_type=kwargs.get("quant_type", "nf4"), - ) - IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward) - self.is_feedforward = is_feedforward + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) # Freezing the pre-trained weight matrix - self.weight.requires_grad = False - - init_ia3_weights = kwargs.pop("init_ia3_weights", True) + self.get_base_layer().weight.requires_grad = False self.update_layer(adapter_name, init_ia3_weights) - self.set_adapter(adapter_name) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) if self.disable_adapters: - return super().forward(x) + return self.base_layer(x) ia3_scaling = 1 for active_adapter in self.active_adapters: @@ -125,10 +107,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if requires_conversion: x = x.float() if self.is_feedforward: - result = super().forward(x * ia3_scaling) + result = self.base_layer(x * ia3_scaling) expected_dtype = result.dtype else: - result = super().forward(x) + result = self.base_layer(x) expected_dtype = result.dtype result = result * ia3_scaling @@ -140,3 +122,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: result = result.to(expected_dtype) return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." + rep diff --git a/src/peft/tuners/ia3/layer.py b/src/peft/tuners/ia3/layer.py index 50696a0e08..45ef388399 100644 --- a/src/peft/tuners/ia3/layer.py +++ b/src/peft/tuners/ia3/layer.py @@ -14,11 +14,11 @@ # limitations under the License. 
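The hunks above and below all apply the same refactoring: adapter layers stop inheriting from the concrete module types (`nn.Linear`, `bnb.nn.Linear8bitLt`, GPTQ `QuantLinear`, ...) and instead become plain `nn.Module`s that hold the wrapped target as `self.base_layer`, reach the frozen weight through `get_base_layer()`, and delegate the actual projection to the wrapped module inside `forward`. The following is only a minimal sketch of that pattern with a toy single-vector adapter; `ToyIA3Wrapper` and everything in it is illustrative and not part of the PEFT API.

```py
import torch
import torch.nn as nn


class ToyIA3Wrapper(nn.Module):
    """Toy stand-in for the wrapper pattern used by the refactored adapter layers."""

    def __init__(self, base_layer: nn.Linear) -> None:
        super().__init__()
        self.base_layer = base_layer
        # freeze the pre-trained weight; only the adapter vector stays trainable
        self.get_base_layer().weight.requires_grad = False
        self.ia3_l = nn.Parameter(torch.ones(base_layer.out_features))

    def get_base_layer(self) -> nn.Module:
        return self.base_layer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # delegate the linear projection to the wrapped layer, then rescale its output
        return self.base_layer(x) * self.ia3_l


wrapped = ToyIA3Wrapper(nn.Linear(16, 8))
out = wrapped(torch.randn(2, 16))  # shape (2, 8); the base weight is frozen, ia3_l is trainable
```

Because the original module is kept intact inside `base_layer`, unloading can later hand it back via `get_base_layer()` instead of rebuilding a fresh `nn.Linear`/`Conv1D` from recorded shapes.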
import warnings -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional import torch import torch.nn as nn -import torch.nn.functional as F +from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer from peft.utils import transpose @@ -30,20 +30,30 @@ class IA3Layer(BaseTunerLayer): # All names of other parameters that may contain adapter-related parameters other_layer_names = ("scaling",) - def __init__( - self, - in_features: int, - out_features: int, - is_feedforward: bool, - ): + def __init__(self, base_layer: nn.Module, is_feedforward: bool, **kwargs) -> None: + self.base_layer = base_layer self.scaling = {} self.ia3_l = nn.ParameterDict({}) # Mark the weight as unmerged self._disable_adapters = False self.merged_adapters = [] + self.is_feedforward = is_feedforward + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif isinstance(base_layer, nn.Embedding): + in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") self.in_features = in_features self.out_features = out_features - self.is_feedforward = is_feedforward def update_layer(self, adapter_name, init_ia3_weights): # Actual trainable parameters @@ -54,7 +64,7 @@ def update_layer(self, adapter_name, init_ia3_weights): self.ia3_l[adapter_name] = nn.Parameter(weight) if init_ia3_weights: self.reset_ia3_parameters(adapter_name) - self.to(self.weight.device) + self.to(self.get_base_layer().weight.device) self.set_adapter(self.active_adapters) def reset_ia3_parameters(self, adapter_name): @@ -63,35 +73,24 @@ def reset_ia3_parameters(self, adapter_name): nn.init.constant_(self.ia3_l[adapter_name], 1.0) -class Linear(nn.Linear, IA3Layer): +class Linear(nn.Module, IA3Layer): # (IA)^3 implemented in a dense layer def __init__( self, + base_layer: nn.Module, adapter_name: str, - in_features: int, - out_features: int, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer is_target_conv_1d_layer: bool = False, # whether target module is a conv1d layer. 
useful while unloading later + init_ia3_weights: bool = True, # whether to initialize IA3 weights **kwargs, ) -> None: - init_ia3_weights = kwargs.pop("init_ia3_weights", True) - - nn.Linear.__init__(self, in_features, out_features, **kwargs) - IA3Layer.__init__(self, in_features=in_features, out_features=out_features, is_feedforward=is_feedforward) - self.is_feedforward = is_feedforward - # Freezing the pre-trained weight matrix - self.weight.requires_grad = False - + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) self.fan_in_fan_out = fan_in_fan_out - if fan_in_fan_out: - self.weight.data = self.weight.data.T - self.is_target_conv_1d_layer = is_target_conv_1d_layer - - nn.Linear.reset_parameters(self) + self._active_adapter = adapter_name self.update_layer(adapter_name, init_ia3_weights) - self.set_adapter(adapter_name) def update_layer(self, adapter_name, init_ia3_weights): # Actual trainable parameters @@ -102,7 +101,7 @@ def update_layer(self, adapter_name, init_ia3_weights): self.ia3_l[adapter_name] = nn.Parameter(weight) if init_ia3_weights: self.reset_ia3_parameters(adapter_name) - self.to(self.weight.device) + self.to(self.get_base_layer().weight.device) self.set_adapter(self.active_adapters) def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: @@ -129,24 +128,23 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N for active_adapter in adapter_names: if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) if safe_merge: - orig_weights = transpose(self.weight, self.fan_in_fan_out).clone() - orig_weights = torch.mul(orig_weights.data, self.ia3_l[active_adapter].data) + orig_weights = base_layer.weight.data + orig_weights = torch.mul(orig_weights, ia3_l) if not torch.isfinite(orig_weights).all(): raise ValueError( f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" ) - self.weight.data = orig_weights - self.weight = transpose(self.weight, self.fan_in_fan_out) + base_layer.weight.data = orig_weights else: - self.weight = transpose(self.weight, self.fan_in_fan_out) - self.weight.data = torch.mul(self.weight.data, self.ia3_l[active_adapter].data) - self.weight = transpose(self.weight, self.fan_in_fan_out) + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l) - if not self.is_feedforward and (self.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(self.bias.shape) - self.bias.data = torch.mul(self.bias.data, scaling.data) + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) self.merged_adapters.append(active_adapter) @@ -159,27 +157,24 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.ia3_l.keys(): - self.weight = transpose(self.weight, self.fan_in_fan_out) - # divide by (IA)^3 vector. 
Add tolerace to avoid division by zero - self.weight.data = torch.div(self.weight.data, self.ia3_l[active_adapter].data + 1e-8) - self.weight = transpose(self.weight, self.fan_in_fan_out) + base_layer = self.get_base_layer() + # Add tolerace to avoid division by zero + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + 1e-8 + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l) - if not self.is_feedforward and (self.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(self.bias.shape) - self.bias.data = torch.div(self.bias.data, scaling.data + 1e-8) + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8) - def _linear(self, input: torch.Tensor) -> torch.Tensor: - return F.linear(input, transpose(self.weight, self.fan_in_fan_out), bias=self.bias) - - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: dtype = previous_dtype = x.dtype if self.disable_adapters: if self.merged: self.unmerge() - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) elif self.merged: - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) else: ia3_scaling = 1 for active_adapter in self.active_adapters: @@ -190,46 +185,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.is_feedforward: x = x.to(dtype) - # TODO: self.weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype # e.g. bf16 vs fp32. Is that okay? - interm = (x * ia3_scaling).to(self.weight.dtype) - result = self._linear(interm) + interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) + result = self.base_layer(interm, *args, **kwargs) else: - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) result = result.to(dtype) * ia3_scaling result = result.to(previous_dtype) return result -class Conv2d(nn.Conv2d, IA3Layer): +class Conv2d(nn.Module, IA3Layer): def __init__( self, + base_layer: nn.Module, adapter_name: str, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]] = 1, - padding: Union[int, Tuple[int]] = 0, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + init_ia3_weights: bool = True, **kwargs, ) -> None: - init_ia3_weights = kwargs.pop("init_ia3_weights", True) - - nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) - IA3Layer.__init__(self, in_features=in_channels, out_features=out_channels, is_feedforward=is_feedforward) - self.is_feedforward = is_feedforward - # Freezing the pre-trained weight matrix - self.weight.requires_grad = False - + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) self.fan_in_fan_out = fan_in_fan_out - if fan_in_fan_out: - self.weight.data = self.weight.data.T + self._active_adapter = adapter_name - nn.Conv2d.reset_parameters(self) self.update_layer(adapter_name, init_ia3_weights) - self.set_adapter(adapter_name) def update_layer(self, adapter_name, init_ia3_weights): # Actual trainable parameters @@ -240,7 +223,7 @@ def update_layer(self, adapter_name, init_ia3_weights): 
self.ia3_l[adapter_name] = nn.Parameter(weight) if init_ia3_weights: self.reset_ia3_parameters(adapter_name) - self.to(self.weight.device) + self.to(self.get_base_layer().weight.device) self.set_adapter(self.active_adapters) def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: @@ -267,25 +250,26 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N for active_adapter in adapter_names: if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() ia3_scaling = self.ia3_l[active_adapter].data if not self.is_feedforward: ia3_scaling = ia3_scaling.permute(1, 0, 2, 3) if safe_merge: - output_weight = torch.mul(self.weight.data, ia3_scaling).clone() + output_weight = torch.mul(base_layer.weight.data, ia3_scaling).clone() if not torch.isfinite(output_weight).all(): raise ValueError( f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" ) - self.weight.data = output_weight + base_layer.weight.data = output_weight else: - self.weight.data = torch.mul(self.weight.data, ia3_scaling) + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_scaling) - if not self.is_feedforward and (self.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(self.bias.shape) - self.bias.data = torch.mul(self.bias.data, scaling.data) + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) self.merged_adapters.append(active_adapter) @@ -298,36 +282,26 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() # divide by (IA)^3 vector. Add tolerace to avoid division by zero ia3_scaling = self.ia3_l[active_adapter].data if not self.is_feedforward: ia3_scaling = ia3_scaling.permute(1, 0, 2, 3) - self.weight.data = torch.div(self.weight.data, ia3_scaling + 1e-8) - - if not self.is_feedforward and (self.bias is not None): - scaling = self.ia3_l[active_adapter].reshape(self.bias.shape) - self.bias.data = torch.mul(self.bias.data, scaling.data) - - def _conv2d(self, input: torch.Tensor) -> torch.Tensor: - return F.conv2d( - input, - self.weight, - bias=self.bias, - stride=self.stride, - padding=self.padding, - dilation=self.dilation, - groups=self.groups, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_scaling + 1e-8) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype if self.disable_adapters: if self.merged: self.unmerge() - result = self._conv2d(x) + result = self.base_layer(x, *args, **kwargs) elif self.merged: - result = self._conv2d(x) + result = self.base_layer(x, *args, **kwargs) else: ia3_scaling = 1 for active_adapter in self.active_adapters: @@ -338,12 +312,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.is_feedforward: x = x.to(dtype) - # TODO: self.weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype # e.g. bf16 vs fp32. Is that okay? 
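To make the `is_feedforward` branch in these forward passes concrete: for feedforward target modules the learned (IA)^3 vector rescales the input before it reaches the frozen layer, while for all other targets it rescales the layer's output. A rough sketch under those assumptions (plain tensors, no dtype handling, not PEFT code):

```py
import torch
import torch.nn as nn

base = nn.Linear(16, 8)
x = torch.randn(2, 16)

ia3_feedforward = torch.rand(1, 16)  # one scale per input feature
ia3_other = torch.rand(8)            # one scale per output feature

out_feedforward = base(x * ia3_feedforward)  # scale the input, then run the base layer
out_other = base(x) * ia3_other              # run the base layer, then scale the output
```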
- interm = (x * ia3_scaling).to(self.weight.dtype) - result = self._conv2d(interm) + interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) + result = self.base_layer(interm, *args, **kwargs) else: - result = self._conv2d(x) + result = self.base_layer(x, *args, **kwargs) result = result.to(dtype) * ia3_scaling result = result.to(previous_dtype) diff --git a/src/peft/tuners/ia3/model.py b/src/peft/tuners/ia3/model.py index 29802359f7..7b2f9d19d9 100644 --- a/src/peft/tuners/ia3/model.py +++ b/src/peft/tuners/ia3/model.py @@ -23,7 +23,7 @@ from transformers.pytorch_utils import Conv1D from peft.import_utils import is_bnb_4bit_available, is_bnb_available -from peft.tuners.tuners_utils import BaseTuner, check_target_module_exists +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists from peft.utils import ( TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, @@ -83,12 +83,16 @@ def __init__(self, model, config, adapter_name): @staticmethod def _create_new_module(ia3_config, adapter_name, target, **kwargs): - bias = hasattr(target, "bias") and target.bias is not None loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) is_feedforward = kwargs.pop("is_feedforward", False) - if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): eightbit_kwargs = kwargs.copy() eightbit_kwargs.update( { @@ -98,15 +102,8 @@ def _create_new_module(ia3_config, adapter_name, target, **kwargs): "index": target.index, } ) - new_module = Linear8bitLt( - adapter_name, - target.in_features, - target.out_features, - is_feedforward, - bias=bias, - **eightbit_kwargs, - ) - elif loaded_in_4bit and isinstance(target, bnb.nn.Linear4bit): + new_module = Linear8bitLt(target, adapter_name, is_feedforward=is_feedforward, **eightbit_kwargs) + elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit): fourbit_kwargs = kwargs.copy() fourbit_kwargs.update( { @@ -115,56 +112,31 @@ def _create_new_module(ia3_config, adapter_name, target, **kwargs): "quant_type": target.weight.quant_type, } ) - new_module = Linear4bit( - adapter_name, - target.in_features, - target.out_features, - is_feedforward, - bias=bias, - **fourbit_kwargs, - ) + new_module = Linear4bit(target, adapter_name, is_feedforward=is_feedforward, **fourbit_kwargs) elif isinstance(target, torch.nn.Conv2d): - out_channels, in_channels = target.weight.size()[:2] - kernel_size = target.weight.size()[2:] - stride = target.stride - padding = target.padding - new_module = Conv2d( - adapter_name=adapter_name, - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - is_feedforward=is_feedforward, - **kwargs, - ) - else: - if isinstance(target, torch.nn.Linear): - in_features, out_features = target.in_features, target.out_features - if kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " - "Setting fan_in_fan_out to False." 
- ) - kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False - elif isinstance(target, Conv1D): - in_features, out_features = ( - target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape + new_module = Conv2d(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." ) - kwargs["is_target_conv_1d_layer"] = True # useful for unloading later - if not kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to False but the target module is `Conv1D`. " - "Setting fan_in_fan_out to True." - ) - kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True - else: - raise ValueError( - f"Target module {target} is not supported. " - f"Currently, only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported." + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False + new_module = Linear(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " + "Setting fan_in_fan_out to True." ) + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True new_module = Linear( - adapter_name, in_features, out_features, is_feedforward=is_feedforward, bias=bias, **kwargs + target, adapter_name, is_feedforward=is_feedforward, is_target_conv_1d_layer=True, **kwargs + ) + else: + raise ValueError( + f"Target module {target} is not supported. " + f"Currently, only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported." ) return new_module @@ -201,21 +173,16 @@ def _create_and_replace( "is_feedforward": is_feedforward, } - if isinstance(target, IA3Layer): - if target.is_feedforward != is_feedforward: - raise ValueError( - "New adapter should have the same value for `is_feedforward` as previously added adapter." 
- ) - if isinstance(target, torch.nn.Conv2d): - target.update_layer( - adapter_name, - ia3_config.init_ia3_weights, - ) - else: # Linear - target.update_layer( - adapter_name, - ia3_config.init_ia3_weights, - ) + if isinstance(target, Conv2d): + target.update_layer( + adapter_name, + ia3_config.init_ia3_weights, + ) + elif isinstance(target, Linear): + target.update_layer( + adapter_name, + ia3_config.init_ia3_weights, + ) else: new_module = self._create_new_module(ia3_config, adapter_name, target, **kwargs) if adapter_name != self.active_adapter: @@ -238,11 +205,22 @@ def _check_target_module_feedforward(ia3_config, key) -> bool: @staticmethod def _replace_module(parent, child_name, new_module, child): setattr(parent, child_name, new_module) - new_module.weight = child.weight - if child.bias is not None: - new_module.bias = child.bias + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + # layers with base_layer don't need the weight to be copied, as they have a reference already + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + if getattr(child, "state", None) is not None: - new_module.state = child.state + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state new_module.to(child.weight.device) # dispatch to correct device @@ -298,7 +276,9 @@ def _prepare_adapter_config(self, peft_config, model_config): ] return peft_config - def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None): + def _unload_and_optionally_merge( + self, merge: bool = True, safe_merge: bool = False, adapter_names: Optional[List[str]] = None + ): r""" This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model as a standalone model. @@ -325,31 +305,46 @@ def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[Lis except AttributeError: continue - # save any additional trainable modules part of `modules_to_save` - if isinstance(target, ModulesToSaveWrapper): + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` setattr(parent, target_name, target.modules_to_save[target.active_adapter]) - continue - if not isinstance(target, IA3Layer): - continue + return self.model - if isinstance(target, torch.nn.Conv2d): - new_module = torch.nn.Conv2d( - target.in_channels, - target.out_channels, - kernel_size=target.kernel_size, - stride=target.stride, - padding=target.padding, - dilation=target.dilation, - ) - else: - bias = target.bias is not None - if getattr(target, "is_target_conv_1d_layer", False): - new_module = Conv1D(target.out_features, target.in_features) - else: - new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) + def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None): + r""" + This method merges the IAΒ³ layers into the base model. This is needed if someone wants to use the base model as + a standalone model. 
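For intuition on why `_unload_and_optionally_merge` can hand back the bare `base_layer`: merging an (IA)^3 vector multiplies it into the frozen weight in place, and `unmerge` divides it back out with a small epsilon guard, so a merge followed by an unmerge recovers the original weight up to floating-point error. A simplified numerical sketch (plain tensors, not PEFT code):

```py
import torch

weight = torch.randn(8, 16)     # stands in for base_layer.weight.data
ia3_l = torch.rand(8, 1) + 0.5  # learned scaling vector, one entry per output feature

merged = weight * ia3_l             # what merge() writes into the base weight
restored = merged / (ia3_l + 1e-8)  # what unmerge() divides back out

assert torch.allclose(weight, restored, atol=1e-5)
```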
- target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - self._replace_module(parent, target_name, new_module, target) + Args: + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. - return self.model + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge(safe_merge=safe_merge, adapter_names=adapter_names) + + def unload(self): + """ + Gets back the base model by removing all the IAΒ³ modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) diff --git a/src/peft/tuners/loha/layer.py b/src/peft/tuners/loha/layer.py index 2a8a205b02..4733336419 100644 --- a/src/peft/tuners/loha/layer.py +++ b/src/peft/tuners/loha/layer.py @@ -14,7 +14,7 @@ # limitations under the License. import math -from typing import Optional, Set, Tuple, Union +from typing import Any, Set, Tuple import torch import torch.nn as nn @@ -23,14 +23,14 @@ from peft.tuners.lycoris_utils import LycorisLayer -class LoHaLayer(LycorisLayer, nn.Module): +class LoHaLayer(nn.Module, LycorisLayer): # All names of layers that may contain adapter weights adapter_layer_names = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2") # other_param_names is defined on parent class - def __init__(self): - LycorisLayer.__init__(self) - super(nn.Module, self).__init__() + def __init__(self, base_layer: nn.Module): + super().__init__() + LycorisLayer.__init__(self, base_layer) # LoHa info self.hada_w1_a = nn.ParameterDict({}) @@ -76,6 +76,21 @@ def reset_adapter_parameters(self, adapter_name: str): nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + def reset_adapter_parameters_random(self, adapter_name: str): + # Original implementation performs initialization with normal distribution + # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 + + # FedPara paper proposes to perform He initialization, let's stick with it + # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization + if adapter_name in self.hada_w1_a.keys(): + nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_b[adapter_name], a=math.sqrt(5)) + if adapter_name in self.hada_t1.keys(): + nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + def update_layer( self, adapter_name: str, @@ -107,16 +122,20 @@ def update_layer( self.module_dropout[adapter_name] = module_dropout # Determine shape of LoHa weights - if isinstance(self, nn.Linear): - shape = tuple(self.weight.shape) - elif 
isinstance(self, nn.Conv2d): - use_effective_conv2d = use_effective_conv2d and self.kernel_size != (1, 1) + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + shape = tuple(base_layer.weight.shape) + elif isinstance(base_layer, nn.Conv2d): + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) if use_effective_conv2d: - shape = (self.out_channels, self.in_channels, *self.kernel_size) + shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size) else: - shape = (self.out_channels, self.in_channels * self.kernel_size[0] * self.kernel_size[1]) + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ) else: - raise TypeError(f"LoHa is not implemented for {type(self).__name__} layer") + raise TypeError(f"LoHa is not implemented for base layers of type {type(base_layer).__name__}") # Create weights with provided shape self.create_adapter_parameters(adapter_name, r, shape) @@ -124,9 +143,11 @@ def update_layer( # Initialize weights if init_weights: self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) # Move new weights to device - weight = getattr(self, "weight", None) + weight = getattr(self.get_base_layer(), "weight", None) if weight is not None: # the layer is already completely initialized, this is an update if weight.dtype.is_floating_point or weight.dtype.is_complex: @@ -156,7 +177,8 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: scale=torch.tensor(self.scaling[adapter_name]), ) - weight = weight.reshape(self.weight.shape) + base_layer = self.get_base_layer() + weight = weight.reshape(base_layer.weight.shape) # Perform rank dropout during training - drop rows of addition weights rank_dropout = self.rank_dropout[adapter_name] @@ -171,96 +193,107 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: return weight + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs) + + result = result.to(previous_dtype) + return result -class Linear(LoHaLayer, nn.Linear): + +class Linear(LoHaLayer): """LoHa implemented in Linear layer""" def __init__( self, - in_features: int, - out_features: int, - bias: bool = True, - device: Optional[Union[str, torch.device]] = None, - dtype: Optional[torch.dtype] = None, + base_layer: nn.Module, adapter_name: str = "default", r: int = 0, alpha: float = 0.0, rank_dropout: float = 0.0, module_dropout: float = 0.0, + init_weights: bool = True, **kwargs, ): - init_weights = kwargs.pop("init_weights", True) - self._init_empty_weights(nn.Linear, in_features, out_features, bias, device=device, dtype=dtype) - - LoHaLayer.__init__(self) + super().__init__(base_layer) # Create adapter and set it active + self._active_adapter = adapter_name self.update_layer(adapter_name, r, alpha, 
rank_dropout, module_dropout, init_weights, **kwargs) - self.set_adapter(adapter_name) - def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: - return F.linear(input, weight, bias=self.bias) + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + return F.linear(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "loha." + rep -class Conv2d(LoHaLayer, nn.Conv2d): +class Conv2d(LoHaLayer): """LoHa implemented in Conv2d layer""" def __init__( self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]] = 1, - padding: Union[int, Tuple[int]] = 0, - dilation: int = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = "zeros", - device: Optional[Union[str, torch.device]] = None, - dtype: Optional[torch.dtype] = None, + base_layer: nn.Module, adapter_name: str = "default", r: int = 0, alpha: float = 0.0, rank_dropout: float = 0.0, module_dropout: float = 0.0, use_effective_conv2d: bool = False, + init_weights: bool = True, **kwargs, ): - init_weights = kwargs.pop("init_weights", True) - self._init_empty_weights( - nn.Conv2d, - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias, - padding_mode=padding_mode, - device=device, - dtype=dtype, - ) - - LoHaLayer.__init__(self) + super().__init__(base_layer) # Create adapter and set it active + self._active_adapter = adapter_name self.update_layer( adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs ) - self.set_adapter(adapter_name) - def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() return F.conv2d( input, - weight, - bias=self.bias, - stride=self.stride, - padding=self.padding, - dilation=self.dilation, - groups=self.groups, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, ) + def __repr__(self) -> str: + rep = super().__repr__() + return "loha." + rep + # Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9 diff --git a/src/peft/tuners/loha/model.py b/src/peft/tuners/loha/model.py index 92d5b887ef..e641fdbac7 100644 --- a/src/peft/tuners/loha/model.py +++ b/src/peft/tuners/loha/model.py @@ -13,11 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
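Since the LoHa hunks above replace the old `_op` override with `_get_delta_activations`, a compact sketch of what that delta amounts to may help: the adapter weight is the element-wise (Hadamard) product of two low-rank factorizations, applied on top of the frozen layer's output without touching the base bias. The snippet below is only an illustration with made-up names and a fixed scale; the real implementation builds the weight inside `get_delta_weight` (optionally with the effective-conv2d decomposition) and applies rank dropout during training.

```py
import torch
import torch.nn as nn
import torch.nn.functional as F

in_features, out_features, r, scale = 16, 8, 4, 1.0
base = nn.Linear(in_features, out_features)

# two independent low-rank factorizations of the same (out_features, in_features) shape
w1_a, w1_b = torch.randn(out_features, r), torch.randn(r, in_features)
w2_a, w2_b = torch.randn(out_features, r), torch.randn(r, in_features)

delta_weight = (w1_a @ w1_b) * (w2_a @ w2_b) * scale  # Hadamard product of the two low-rank products

x = torch.randn(2, in_features)
# bias is not added here because it is already included in the base layer's output
result = base(x) + F.linear(x, delta_weight)
```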
-from typing import Dict, Type +import re +from itertools import chain +from typing import Dict, Type, Union import torch +from torch import nn + +from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner -from ..lycoris_utils import LycorisTuner from .layer import Conv2d, Linear, LoHaLayer @@ -82,3 +86,31 @@ class LoHaModel(LycorisTuner): torch.nn.Conv2d: Conv2d, torch.nn.Linear: Linear, } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LoHaLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + **optional_kwargs, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) + + if isinstance(target, LoHaLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/src/peft/tuners/lokr/layer.py b/src/peft/tuners/lokr/layer.py index 97f3afb6fd..c733f4f4a5 100644 --- a/src/peft/tuners/lokr/layer.py +++ b/src/peft/tuners/lokr/layer.py @@ -14,7 +14,7 @@ # limitations under the License. import math -from typing import Optional, Set, Tuple, Union +from typing import Any, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -23,7 +23,7 @@ from peft.tuners.lycoris_utils import LycorisLayer -class LoKrLayer(LycorisLayer, nn.Module): +class LoKrLayer(nn.Module, LycorisLayer): # All names of layers that may contain adapter weights adapter_layer_names = ( "lokr_w1", @@ -36,9 +36,9 @@ class LoKrLayer(LycorisLayer, nn.Module): ) # other_param_names is defined on parent class - def __init__(self): - LycorisLayer.__init__(self) - super(nn.Module, self).__init__() + def __init__(self, base_layer: nn.Module) -> None: + super().__init__() + LycorisLayer.__init__(self, base_layer) # LoKr info self.lokr_w1 = nn.ParameterDict({}) @@ -111,6 +111,22 @@ def reset_adapter_parameters(self, adapter_name: str): if adapter_name in self.lokr_t2: nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) + def reset_adapter_parameters_random(self, adapter_name: str): + if adapter_name in self.lokr_w1: + nn.init.kaiming_uniform_(self.lokr_w1[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_w2: + nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_t2: + nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) + def update_layer( self, adapter_name: str, @@ -143,10 +159,11 @@ def update_layer( self.scaling[adapter_name] = alpha / r self.rank_dropout[adapter_name] = rank_dropout self.module_dropout[adapter_name] = module_dropout + base_layer = self.get_base_layer() # Determine shape of LoKr 
weights - if isinstance(self, nn.Linear): - in_dim, out_dim = self.in_features, self.out_features + if isinstance(base_layer, nn.Linear): + in_dim, out_dim = base_layer.in_features, base_layer.out_features in_m, in_n = factorization(in_dim, decompose_factor) out_l, out_k = factorization(out_dim, decompose_factor) @@ -155,9 +172,9 @@ def update_layer( use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) use_w2 = not (r < max(shape[0][1], shape[1][1]) / 2) use_effective_conv2d = False - elif isinstance(self, nn.Conv2d): - in_dim, out_dim = self.in_channels, self.out_channels - k_size = self.kernel_size + elif isinstance(base_layer, nn.Conv2d): + in_dim, out_dim = base_layer.in_channels, base_layer.out_channels + k_size = base_layer.kernel_size in_m, in_n = factorization(in_dim, decompose_factor) out_l, out_k = factorization(out_dim, decompose_factor) @@ -165,9 +182,9 @@ def update_layer( use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 - use_effective_conv2d = use_effective_conv2d and self.kernel_size != (1, 1) + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) else: - raise TypeError(f"LoKr is not implemented for {type(self).__name__} layer") + raise TypeError(f"LoKr is not implemented for base layers of type {type(base_layer).__name__}") # Create weights with provided shape self.create_adapter_parameters(adapter_name, r, shape, use_w1, use_w2, use_effective_conv2d) @@ -175,9 +192,11 @@ def update_layer( # Initialize weights if init_weights: self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) # Move new weights to device - weight = getattr(self, "weight", None) + weight = getattr(self.get_base_layer(), "weight", None) if weight is not None: # the layer is already completely initialized, this is an update if weight.dtype.is_floating_point or weight.dtype.is_complex: @@ -202,7 +221,7 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: # Make weights with Kronecker product weight = make_kron(w1, w2) - weight = weight.reshape(self.weight.shape) + weight = weight.reshape(self.get_base_layer().weight.shape) # Perform rank dropout during training - drop rows of addition weights rank_dropout = self.rank_dropout[adapter_name] @@ -214,15 +233,39 @@ def get_delta_weight(self, adapter_name: str) -> torch.Tensor: return weight + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs) + + result = result.to(previous_dtype) + return result -class Linear(LoKrLayer, nn.Linear): + +class Linear(LoKrLayer): """LoKr implemented in Linear layer""" def __init__( self, - in_features: int, - out_features: int, - bias: bool = True, + base_layer: nn.Module, device: Optional[Union[str, torch.device]] = None, dtype: Optional[torch.dtype] = None, adapter_name: str = 
"default", @@ -230,35 +273,33 @@ def __init__( alpha: float = 0.0, rank_dropout: float = 0.0, module_dropout: float = 0.0, + init_weights: bool = True, **kwargs, ): - init_weights = kwargs.pop("init_weights", True) - self._init_empty_weights(nn.Linear, in_features, out_features, bias, device=device, dtype=dtype) - - LoKrLayer.__init__(self) + super().__init__(base_layer) # Create adapter and set it active + self._active_adapter = adapter_name self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs) - self.set_adapter(adapter_name) - def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: - return F.linear(input, weight, bias=self.bias) + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + return F.linear(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "lokr." + rep -class Conv2d(LoKrLayer, nn.Conv2d): +class Conv2d(LoKrLayer): """LoKr implemented in Conv2d layer""" def __init__( self, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]] = 1, - padding: Union[int, Tuple[int]] = 0, - dilation: int = 1, - groups: int = 1, - bias: bool = True, - padding_mode: str = "zeros", + base_layer: nn.Module, device: Optional[Union[str, torch.device]] = None, dtype: Optional[torch.dtype] = None, adapter_name: str = "default", @@ -267,43 +308,36 @@ def __init__( rank_dropout: float = 0.0, module_dropout: float = 0.0, use_effective_conv2d: bool = False, + init_weights: bool = True, **kwargs, ): - init_weights = kwargs.pop("init_weights", True) - self._init_empty_weights( - nn.Conv2d, - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias, - padding_mode=padding_mode, - device=device, - dtype=dtype, - ) - - LoKrLayer.__init__(self) + super().__init__(base_layer) # Create adapter and set it active + self._active_adapter = adapter_name self.update_layer( adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs ) - self.set_adapter(adapter_name) - def _op(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() return F.conv2d( input, - weight, - bias=self.bias, - stride=self.stride, - padding=self.padding, - dilation=self.dilation, - groups=self.groups, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, ) + def __repr__(self) -> str: + rep = super().__repr__() + return "lokr." 
+ rep + # Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11 diff --git a/src/peft/tuners/lokr/model.py b/src/peft/tuners/lokr/model.py index e08b7a7c48..61535b28b3 100644 --- a/src/peft/tuners/lokr/model.py +++ b/src/peft/tuners/lokr/model.py @@ -13,11 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, Type +import re +from itertools import chain +from typing import Dict, Type, Union import torch +from torch import nn + +from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner -from ..lycoris_utils import LycorisTuner from .layer import Conv2d, Linear, LoKrLayer @@ -83,3 +87,31 @@ class LoKrModel(LycorisTuner): torch.nn.Conv2d: Conv2d, torch.nn.Linear: Linear, } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LoKrLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + **optional_kwargs, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) + + if isinstance(target, LoKrLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/src/peft/tuners/lora/bnb.py b/src/peft/tuners/lora/bnb.py index 4bd8151ed3..1c42a9e8e3 100644 --- a/src/peft/tuners/lora/bnb.py +++ b/src/peft/tuners/lora/bnb.py @@ -30,20 +30,18 @@ class Linear8bitLt(torch.nn.Module, LoraLayer): # Lora implemented in a dense layer def __init__( self, - adapter_name, - base_layer, + base_layer: torch.nn.Module, + adapter_name: str, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ) -> None: super().__init__() - LoraLayer.__init__(self, in_features=base_layer.in_features, out_features=base_layer.out_features) - self.base_layer = base_layer + LoraLayer.__init__(self, base_layer) - init_lora_weights = kwargs.pop("init_lora_weights", True) self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def merge(self, safe_merge: bool = False): """ @@ -69,8 +67,8 @@ def merge(self, safe_merge: bool = False): ) lora_data = self.get_delta_weight(active_adapter) - weight = self.base_layer.weight - state = self.base_layer.state + weight = self.get_base_layer().weight + state = self.get_base_layer().state if state.SCB is None: state.SCB = weight.SCB @@ -90,7 +88,7 @@ def merge(self, safe_merge: bool = False): f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" ) - self.base_layer.weight = bnb.nn.Int8Params( + self.get_base_layer().weight = bnb.nn.Int8Params( w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights ).to(weight.device) state.reset_grads() @@ -110,8 +108,8 @@ def unmerge(self): ) lora_data = self.get_delta_weight(active_adapter) - weight = self.base_layer.weight - state = self.base_layer.state + weight = self.get_base_layer().weight + state = self.get_base_layer().state if state.SCB is None: state.SCB = weight.SCB im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device) @@ -124,7 +122,7 @@ def unmerge(self): output = bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t() w_data = output.to(lora_data.dtype).to(lora_data.device) - lora_data - self.base_layer.weight = bnb.nn.Int8Params( + self.get_base_layer().weight = bnb.nn.Int8Params( w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights ).to(weight.device) state.reset_grads() @@ -169,6 +167,10 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: return result + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + if is_bnb_4bit_available(): @@ -176,20 +178,18 @@ class Linear4bit(torch.nn.Module, LoraLayer): # Lora implemented in a dense layer def __init__( self, - adapter_name, - base_layer, + base_layer: torch.nn.Module, + adapter_name: str, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ) -> None: super().__init__() - LoraLayer.__init__(self, in_features=base_layer.in_features, out_features=base_layer.out_features) - self.base_layer = base_layer + LoraLayer.__init__(self, base_layer) - init_lora_weights = kwargs.pop("init_lora_weights", True) self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def merge(self, safe_merge: bool = False): """ @@ -214,7 +214,7 @@ def merge(self, safe_merge: bool = False): "Merge lora module to 4-bit linear may get different generations due to rounding errors." ) # Refer to https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930 - weight = self.base_layer.weight + weight = self.get_base_layer().weight kwargs = weight.__dict__ lora_data = self.get_delta_weight(active_adapter) @@ -224,7 +224,7 @@ def merge(self, safe_merge: bool = False): f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" ) - self.base_layer.weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( weight.device ) self.merged_adapters.append(active_adapter) @@ -241,11 +241,11 @@ def unmerge(self): warnings.warn( "Unmerge lora module to 4-bit linear may get different generations due to rounding errors." 
) - weight = self.base_layer.weight + weight = self.get_base_layer().weight kwargs = weight.__dict__ lora_data = self.get_delta_weight(active_adapter) w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) - lora_data - self.base_layer.weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( weight.device ) @@ -262,11 +262,11 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: if self.disable_adapters: if self.merged: self.unmerge() - result = self.base_layer.forward(x, *args, **kwargs) + result = self.base_layer(x, *args, **kwargs) elif self.merged: - result = self.base_layer.forward(x, *args, **kwargs) + result = self.base_layer(x, *args, **kwargs) else: - result = self.base_layer.forward(x, *args, **kwargs) + result = self.base_layer(x, *args, **kwargs) # As per Tim Dettmers, for 4bit, we need to defensively clone here. # The reason is that in some cases, an error can occur that backprop # does not work on a manipulated view. This issue may be solved with @@ -294,3 +294,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: result += output return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep diff --git a/src/peft/tuners/lora/gptq.py b/src/peft/tuners/lora/gptq.py index 1505045a3e..75c853184c 100644 --- a/src/peft/tuners/lora/gptq.py +++ b/src/peft/tuners/lora/gptq.py @@ -21,22 +21,21 @@ class QuantLinear(torch.nn.Module, LoraLayer): def __init__( self, - adapter_name, - quant_linear_module, + base_layer, + adapter_name: str, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ): - torch.nn.Module.__init__(self) - LoraLayer.__init__( - self, in_features=quant_linear_module.infeatures, out_features=quant_linear_module.outfeatures - ) - self.quant_linear_module = quant_linear_module - self.weight = quant_linear_module.qweight - init_lora_weights = kwargs.pop("init_lora_weights", True) + super().__init__() + LoraLayer.__init__(self, base_layer) + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def forward(self, x: torch.Tensor): # note: logic differs from default Linear because merging is not supported @@ -65,6 +64,10 @@ def forward(self, x: torch.Tensor): result += output return result + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 # def reset_lora_parameters(self, adapter_name): # if adapter_name in self.lora_A.keys(): diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index e2ced1eee9..c263053183 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -15,11 +15,12 @@ import math import warnings -from typing import List, Optional, Tuple, Union +from typing import Any, List, Optional import torch import torch.nn as nn import torch.nn.functional as F +from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer from peft.utils.other import transpose @@ -31,7 +32,8 @@ class LoraLayer(BaseTunerLayer): # All names of other parameters that may contain adapter-related parameters other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout") - def __init__(self, in_features: int, out_features: int, **kwargs): + def __init__(self, base_layer: nn.Module, **kwargs) -> None: + self.base_layer = base_layer self.r = {} self.lora_alpha = {} self.scaling = {} @@ -44,21 +46,26 @@ def __init__(self, in_features: int, out_features: int, **kwargs): # Mark the weight as unmerged self._disable_adapters = False self.merged_adapters = [] + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif isinstance(base_layer, nn.Embedding): + in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): + # QuantLinear + in_features, out_features = base_layer.infeatures, base_layer.outfeatures + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + self.in_features = in_features self.out_features = out_features - self.kwargs = kwargs - - def _init_empty_weights(self, cls, *args, **kwargs) -> None: - # A helper method that allows to initialize the layer of the given class without spending time to initialize the - # model weights. The implementation is inspired by - # https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used - # directly. - # Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of - # omitting important logic inside that __init__. 
- kwargs = kwargs.copy() - final_device = kwargs.pop("device", "cpu") - cls.__init__(self, *args, device="meta", **kwargs) - self.to_empty(device=final_device) def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights): if r <= 0: @@ -79,7 +86,7 @@ def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weig if init_lora_weights: self.reset_lora_parameters(adapter_name) - weight = getattr(self, "weight", None) + weight = getattr(self.get_base_layer(), "weight", None) if weight is not None: # the layer is already completely initialized, this is an update if weight.dtype.is_floating_point or weight.dtype.is_complex: @@ -100,20 +107,22 @@ def update_layer_conv2d(self, adapter_name, r, lora_alpha, lora_dropout, init_lo self.lora_dropout[adapter_name] = lora_dropout_layer # Actual trainable parameters + base_layer = self.get_base_layer() if r > 0: - kernel_size = self.kwargs["kernel_size"] - stride = self.kwargs["stride"] - padding = self.kwargs["padding"] + kernel_size = base_layer.kernel_size + stride = base_layer.stride + padding = base_layer.padding self.lora_A[adapter_name] = nn.Conv2d(self.in_features, r, kernel_size, stride, padding, bias=False) self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) self.scaling[adapter_name] = lora_alpha / r if init_lora_weights: self.reset_lora_parameters(adapter_name) - weight = getattr(self, "weight", None) + weight = getattr(base_layer, "weight", None) if weight is not None: # the layer is already completely initialized, this is an update - self.to(self.weight.device, dtype=weight.dtype) + self.to(base_layer.weight.device, dtype=weight.dtype) + self.set_adapter(self.active_adapters) def update_layer_embedding(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights): if r <= 0: @@ -136,10 +145,12 @@ def update_layer_embedding(self, adapter_name, r, lora_alpha, lora_dropout, init if init_lora_weights: self.reset_lora_parameters(adapter_name) - weight = getattr(self, "weight", None) + base_layer = self.get_base_layer() + weight = getattr(base_layer, "weight", None) if weight is not None: # the layer is already completely initialized, this is an update - self.to(self.weight.device, dtype=weight.dtype) + self.to(base_layer.weight.device, dtype=weight.dtype) + self.set_adapter(self.active_adapters) def reset_lora_parameters(self, adapter_name): if adapter_name in self.lora_A.keys(): @@ -188,35 +199,27 @@ def unscale_layer(self, scale=None) -> None: # ------------------------------------------------------------------------------------------ -class Linear(nn.Linear, LoraLayer): +class Linear(nn.Module, LoraLayer): # Lora implemented in a dense layer def __init__( self, + base_layer, adapter_name: str, - in_features: int, - out_features: int, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) is_target_conv_1d_layer: bool = False, + init_lora_weights: bool = True, **kwargs, ) -> None: - init_lora_weights = kwargs.pop("init_lora_weights", True) - # this gets the init from nn.Linear's super perspective, i.e. - # nn.Module.__init__, which should always be called - super(nn.Linear, self).__init__() - # Note that we don't use self._init_empty_weights() for Linear because it is a bit slower and the benefit of - # added robustness is not big enough for Linear. 
- - LoraLayer.__init__(self, in_features=in_features, out_features=out_features) - # Freezing the pre-trained weight matrix - + super().__init__() + LoraLayer.__init__(self, base_layer) self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) self.is_target_conv_1d_layer = is_target_conv_1d_layer - self.set_adapter(adapter_name) def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ @@ -242,10 +245,11 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N for active_adapter in adapter_names: if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() if safe_merge: # Note that safe_merge will be slower than the normal merge # because of the copy operation. - orig_weights = self.weight.data.clone() + orig_weights = base_layer.weight.data.clone() orig_weights += self.get_delta_weight(active_adapter) if not torch.isfinite(orig_weights).all(): @@ -253,9 +257,9 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" ) - self.weight.data = orig_weights + base_layer.weight.data = orig_weights else: - self.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -265,7 +269,7 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.lora_A.keys(): - self.weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) def get_delta_weight(self, adapter) -> torch.Tensor: """ @@ -301,20 +305,17 @@ def get_delta_weight(self, adapter) -> torch.Tensor: return output_tensor - def _linear(self, input: torch.Tensor) -> torch.Tensor: - return F.linear(input, transpose(self.weight, self.fan_in_fan_out), bias=self.bias) - - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype if self.disable_adapters: if self.merged: self.unmerge() - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) elif self.merged: - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) else: - result = self._linear(x) + result = self.base_layer(x, *args, **kwargs) for active_adapter in self.active_adapters: if active_adapter not in self.lora_A.keys(): continue @@ -328,24 +329,28 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: result = result.to(previous_dtype) return result + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep -class Embedding(nn.Embedding, LoraLayer): + +class Embedding(nn.Module, LoraLayer): # LoRA implemented in a Embedding layer def __init__( self, + base_layer: nn.Module, adapter_name: str, - num_embeddings: int, - embedding_dim: int, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ) -> None: - init_lora_weights = kwargs.pop("init_lora_weights", True) - self._init_empty_weights(nn.Embedding, num_embeddings, embedding_dim, **kwargs) - LoraLayer.__init__(self, in_features=num_embeddings, out_features=embedding_dim) + super().__init__() + LoraLayer.__init__(self, base_layer) + + self._active_adapter = adapter_name self.update_layer_embedding(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ @@ -371,10 +376,11 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N for active_adapter in adapter_names: if active_adapter in self.lora_embedding_A.keys(): + base_layer = self.get_base_layer() if safe_merge: # Note that safe_merge will be slower than the normal merge # because of the copy operation. - orig_weights = self.weight.data.copy() + orig_weights = base_layer.weight.data.copy() orig_weights += self.get_delta_weight(active_adapter) if not torch.isfinite(orig_weights).all(): @@ -382,9 +388,9 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" ) - self.weight.data = orig_weights + base_layer.weight.data = orig_weights else: - self.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -394,7 +400,7 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.lora_embedding_A.keys(): - self.weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) def get_delta_weight(self, adapter) -> torch.Tensor: """ @@ -430,28 +436,28 @@ def get_delta_weight(self, adapter) -> torch.Tensor: return output_tensor - def _embed(self, input: torch.Tensor, weight: Optional[torch.Tensor] = None) -> torch.Tensor: - weight = self.weight if weight is None else weight + def _embed(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + base_layer = self.get_base_layer() return F.embedding( input, weight, - padding_idx=self.padding_idx, - max_norm=self.max_norm, - norm_type=self.norm_type, - scale_grad_by_freq=self.scale_grad_by_freq, - sparse=self.sparse, + padding_idx=base_layer.padding_idx, + max_norm=base_layer.max_norm, + norm_type=base_layer.norm_type, + scale_grad_by_freq=base_layer.scale_grad_by_freq, + sparse=base_layer.sparse, ) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: # TODO: no dtype conversion here, unlike in Linear, is that correct? 
if self.disable_adapters: if self.merged: self.unmerge() - result = self._embed(x) + result = self.base_layer(x, *args, **kwargs) elif self.merged: - result = self._embed(x) + result = self.base_layer(x, *args, **kwargs) else: - result = self._embed(x) + result = self.base_layer(x, *args, **kwargs) for active_adapter in self.active_adapters: if active_adapter not in self.lora_embedding_A: continue @@ -463,36 +469,28 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return result + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep -class Conv2d(nn.Conv2d, LoraLayer): + +class Conv2d(nn.Module, LoraLayer): # Lora implemented in a conv2d layer def __init__( self, + base_layer: nn.Module, adapter_name: str, - in_channels: int, - out_channels: int, - kernel_size: Union[int, Tuple[int]], - stride: Union[int, Tuple[int]] = 1, - padding: Union[int, Tuple[int]] = 0, r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, + init_lora_weights: bool = True, **kwargs, ) -> None: - init_lora_weights = kwargs.pop("init_lora_weights", True) - self._init_empty_weights(nn.Conv2d, in_channels, out_channels, kernel_size, stride=stride, padding=padding) - - LoraLayer.__init__( - self, - in_features=in_channels, - out_features=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - ) + super().__init__() + LoraLayer.__init__(self, base_layer) + self._active_adapter = adapter_name self.update_layer_conv2d(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ @@ -518,19 +516,20 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N for active_adapter in adapter_names: if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() if safe_merge: # Note that safe_merge will be slower than the normal merge # because of the copy operation. - orig_weights = self.weight.data.copy() + orig_weights = base_layer.weight.data.copy() orig_weights += self.get_delta_weight(active_adapter) if not torch.isfinite(orig_weights).all(): raise ValueError( f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" ) - self.weight.data = orig_weights + base_layer.weight.data = orig_weights else: - self.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -540,7 +539,7 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.lora_A.keys(): - self.weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) def get_delta_weight(self, adapter) -> torch.Tensor: """ @@ -566,7 +565,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor: weight_B = weight_B.float() # https://github.com/bmaltais/kohya_ss/blob/feb6728762a8f463d15ba936d189d4c3abfaa1ab/networks/lora.py#L117 - if self.weight.size()[2:4] == (1, 1): + if self.get_base_layer().weight.size()[2:4] == (1, 1): # conv2d 1x1 output_tensor = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze( 3 @@ -590,28 +589,17 @@ def get_delta_weight(self, adapter) -> torch.Tensor: return output_tensor - def _conv2d(self, input: torch.Tensor) -> torch.Tensor: - return F.conv2d( - input, - self.weight, - bias=self.bias, - stride=self.stride, - padding=self.padding, - dilation=self.dilation, - groups=self.groups, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: previous_dtype = x.dtype if self.disable_adapters: if self.merged: self.unmerge() - result = self._conv2d(x) + result = self.base_layer(x, *args, **kwargs) elif self.merged: - result = self._conv2d(x) + result = self.base_layer(x, *args, **kwargs) else: - result = self._conv2d(x) + result = self.base_layer(x, *args, **kwargs) for active_adapter in self.active_adapters: if active_adapter not in self.lora_A.keys(): continue @@ -624,3 +612,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: result = result.to(previous_dtype) return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 6b76ad9b6c..a5b7735ce3 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -22,7 +22,6 @@ from typing import List, Optional import torch -from torch import nn from tqdm import tqdm from transformers.pytorch_utils import Conv1D @@ -108,6 +107,8 @@ class LoraModel(BaseTuner): - **peft_config** ([`LoraConfig`]): The configuration of the Lora model. 
""" + prefix: str = "lora_" + def __init__(self, model, config, adapter_name) -> None: super().__init__(model, config, adapter_name) @@ -165,7 +166,7 @@ def _create_and_replace( kwargs["gptq_quantization_config"] = quantization_config # TODO: better deal with that - if isinstance(target, LoraLayer) and isinstance(target, torch.nn.Conv2d): + if isinstance(target, Conv2d): target.update_layer_conv2d( adapter_name, r, @@ -173,7 +174,7 @@ def _create_and_replace( lora_config.lora_dropout, lora_config.init_lora_weights, ) - elif isinstance(target, LoraLayer) and isinstance(target, torch.nn.Embedding): + elif isinstance(target, Embedding): target.update_layer_embedding( adapter_name, r, @@ -181,8 +182,7 @@ def _create_and_replace( lora_config.lora_dropout, lora_config.init_lora_weights, ) - - elif isinstance(target, LoraLayer): + elif isinstance(target, Linear): target.update_layer( adapter_name, r, @@ -197,8 +197,7 @@ def _create_and_replace( new_module.requires_grad_(False) self._replace_module(parent, target_name, new_module, target) - @staticmethod - def _replace_module(parent, child_name, new_module, child): + def _replace_module(self, parent, child_name, new_module, child): setattr(parent, child_name, new_module) # It's not necessary to set requires_grad here, as that is handled by # _mark_only_adapters_as_trainable @@ -206,10 +205,7 @@ def _replace_module(parent, child_name, new_module, child): # child layer wraps the original module, unpack it if hasattr(child, "base_layer"): child = child.base_layer - elif hasattr(child, "quant_linear_module"): - child = child.quant_linear_module - # TODO: layers with base_layer don't need the weight to be copied, as they have a reference already if not hasattr(new_module, "base_layer"): new_module.weight = child.weight if hasattr(child, "bias"): @@ -224,14 +220,13 @@ def _replace_module(parent, child_name, new_module, child): # dispatch to correct device for name, module in new_module.named_modules(): - if "lora_" in name: - module.to(child.weight.device) - if "ranknum" in name: - module.to(child.weight.device) + if (self.prefix in name) or ("ranknum" in name): + weight = child.qweight if hasattr(child, "qweight") else child.weight + module.to(weight.device) def _mark_only_adapters_as_trainable(self) -> None: for n, p in self.model.named_parameters(): - if "lora_" not in n: + if self.prefix not in n: p.requires_grad = False for active_adapter in self.active_adapters: @@ -257,9 +252,13 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) - bias = kwargs.pop("bias", False) - if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): eightbit_kwargs = kwargs.copy() eightbit_kwargs.update( { @@ -269,8 +268,8 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): "index": target.index, } ) - new_module = Linear8bitLt(adapter_name, target, **eightbit_kwargs) - elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target, bnb.nn.Linear4bit): + new_module = Linear8bitLt(target, adapter_name, **eightbit_kwargs) + elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): fourbit_kwargs = kwargs.copy() fourbit_kwargs.update( { @@ -279,47 +278,37 @@ def 
_create_new_module(lora_config, adapter_name, target, **kwargs): "quant_type": target.weight.quant_type, } ) - new_module = Linear4bit(adapter_name, target, **fourbit_kwargs) - elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear): - new_module = QuantLinear(adapter_name, target, **kwargs) + new_module = Linear4bit(target, adapter_name, **fourbit_kwargs) + elif AutoGPTQQuantLinear is not None and isinstance(target_base_layer, AutoGPTQQuantLinear): + new_module = QuantLinear(target, adapter_name, **kwargs) target.weight = target.qweight - elif isinstance(target, torch.nn.Embedding): + elif isinstance(target_base_layer, torch.nn.Embedding): embedding_kwargs = kwargs.copy() embedding_kwargs.pop("fan_in_fan_out", None) - in_features, out_features = target.num_embeddings, target.embedding_dim - new_module = Embedding(adapter_name, in_features, out_features, **embedding_kwargs) - elif isinstance(target, torch.nn.Conv2d): - out_channels, in_channels = target.weight.size()[:2] - kernel_size = target.weight.size()[2:] - stride = target.stride - padding = target.padding - new_module = Conv2d(adapter_name, in_channels, out_channels, kernel_size, stride, padding, **kwargs) - else: - if isinstance(target, torch.nn.Linear): - in_features, out_features = target.in_features, target.out_features - if kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " - "Setting fan_in_fan_out to False." - ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False - elif isinstance(target, Conv1D): - in_features, out_features = ( - target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape + new_module = Embedding(target, adapter_name, **embedding_kwargs) + elif isinstance(target_base_layer, torch.nn.Conv2d): + new_module = Conv2d(target, adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." ) - kwargs["is_target_conv_1d_layer"] = True - if not kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to False but the target module is `Conv1D`. " - "Setting fan_in_fan_out to True." - ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True - else: - raise ValueError( - f"Target module {target} is not supported. Currently, only the following modules are supported: " - "`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`." + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + new_module = Linear(target, adapter_name, **kwargs) + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " + "Setting fan_in_fan_out to True." ) - new_module = Linear(adapter_name, in_features, out_features, bias=bias, **kwargs) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + new_module = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`." 
+ ) return new_module @@ -388,60 +377,20 @@ def _unload_and_optionally_merge( if getattr(self.model, "quantization_method", None) == "gptq": raise ValueError("Cannot merge LORA layers when the model is gptq quantized") - key_list = [key for key, _ in self.model.named_modules() if "lora" not in key] + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] desc = "Unloading " + ("and merging " if merge else "") + "model" for key in tqdm(key_list, disable=not progressbar, desc=desc): try: parent, target, target_name = _get_submodules(self.model, key) except AttributeError: continue - if isinstance(target, LoraLayer): - if isinstance(target, nn.Embedding): - new_module = torch.nn.Embedding(target.in_features, target.out_features) - elif isinstance(target, nn.Conv2d): - new_module = torch.nn.Conv2d( - target.in_channels, - target.out_channels, - kernel_size=target.kernel_size, - stride=target.stride, - padding=target.padding, - dilation=target.dilation, - ) - elif is_bnb_available() and isinstance(target, Linear8bitLt): - bias = target.base_layer.bias is not None - new_module = bnb.nn.Linear8bitLt( - target.in_features, - target.out_features, - bias=bias, - has_fp16_weights=target.base_layer.state.has_fp16_weights, - memory_efficient_backward=target.base_layer.state.memory_efficient_backward, - threshold=target.base_layer.state.threshold, - index=target.base_layer.index, - device=target.base_layer.weight.device, - ) - elif is_bnb_4bit_available() and isinstance(target, Linear4bit): - bias = target.base_layer.bias is not None - new_module = bnb.nn.Linear4bit( - target.in_features, - target.out_features, - bias=bias, - compute_dtype=target.base_layer.compute_dtype, - compress_statistics=target.base_layer.weight.compress_statistics, - quant_type=target.base_layer.weight.quant_type, - device=target.base_layer.weight.device, - ) - else: - bias = target.bias is not None - if getattr(target, "is_target_conv_1d_layer", False): - new_module = Conv1D(target.out_features, target.in_features) - else: - new_module = torch.nn.Linear(target.in_features, target.out_features, bias=bias) + + if hasattr(target, "base_layer"): if merge: target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - self._replace_module(parent, target_name, new_module, target) - - # save any additional trainable modules part of `modules_to_save` - if isinstance(target, ModulesToSaveWrapper): + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` setattr(parent, target_name, target.modules_to_save[target.active_adapter]) return self.model @@ -543,7 +492,7 @@ def add_weighted_adapter( # Do we really need that? 
_freeze_adapter(self.model, adapter_name) - key_list = [key for key, _ in self.model.named_modules() if "lora" not in key] + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] for key in key_list: _, target, _ = _get_submodules(self.model, key) if isinstance(target, LoraLayer): @@ -667,7 +616,7 @@ def delete_adapter(self, adapter_name: str): raise ValueError(f"Adapter {adapter_name} does not exist") del self.peft_config[adapter_name] - key_list = [key for key, _ in self.model.named_modules() if "lora" not in key] + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] new_adapter = None for key in key_list: _, target, _ = _get_submodules(self.model, key) diff --git a/src/peft/tuners/lycoris_utils.py b/src/peft/tuners/lycoris_utils.py index d3085c4831..5865887506 100644 --- a/src/peft/tuners/lycoris_utils.py +++ b/src/peft/tuners/lycoris_utils.py @@ -13,12 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re import warnings from abc import abstractmethod from dataclasses import dataclass, field -from itertools import chain -from typing import Dict, List, Optional, Set, Type, Union +from typing import Any, Dict, List, Optional, Set, Type, Union import torch import torch.nn as nn @@ -58,14 +56,15 @@ class LycorisConfig(PeftConfig): ) -class LycorisLayer(BaseTunerLayer, nn.Module): +class LycorisLayer(BaseTunerLayer): r""" A base layer for LyCORIS like adapters """ # adapter_layer_names needs to be defined on the child class other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout") - def __init__(self): + def __init__(self, base_layer: nn.Module) -> None: + self.base_layer = base_layer self.r = {} self.alpha = {} self.scaling = {} @@ -93,48 +92,20 @@ def _init_empty_weights(self, cls, *args, **kwargs) -> None: cls.__init__(self, *args, device="meta", **kwargs) self.to_empty(device=final_device) - def _op(self, x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - @abstractmethod def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs): ... - def forward(self, x: torch.Tensor) -> torch.Tensor: - previous_dtype = x.dtype - - if self.disable_adapters: - if self.merged: - self.unmerge() - result = self._op(x, self.weight) - elif self.merged: - result = self._op(x, self.weight) - else: - # Get base weights - weight = self.weight.data - - # Execute all the adapters - for active_adapter in self.active_adapters: - if active_adapter not in self._available_adapters: - continue - - module_dropout = self.module_dropout[active_adapter] - - # Modify current execution weights - if (not self.training) or (self.training and torch.rand(1) > module_dropout): - weight = weight + self.get_delta_weight(active_adapter) - - # Perform actual operation - result = self._op(x, weight) - - result = result.to(previous_dtype) - return result + # TODO: refactor LoRA to use the same approach + @abstractmethod + def _get_delta_activations(self, adapter_name: str, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + """Activations added on top of the base layer output (i.e. after the base layer forward pass)""" @abstractmethod def get_delta_weight(self, adapter_name: str) -> torch.Tensor: ... 
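With _op and the weight-patching forward removed, a LycorisLayer is now a plain wrapper around the frozen base module: the base layer computes its usual output, and each active adapter contributes extra activations through _get_delta_activations, gated by module_dropout. The following is a minimal sketch of that pattern for a Linear base layer; the class name, adapter name, and the simple low-rank factors used here are illustrative stand-ins, not the actual LoKr/LoHa parametrizations from this patch.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchWrapperLinear(nn.Module):
    """Illustrative adapter wrapper: base output + delta activations, no weight patching."""

    def __init__(self, base_layer: nn.Linear, adapter_name: str = "default", r: int = 4):
        super().__init__()
        self.base_layer = base_layer                      # frozen pretrained layer
        self.active_adapters = [adapter_name]
        self.module_dropout = {adapter_name: 0.0}
        # plain low-rank factors as a stand-in for the LoKr Kronecker factors
        self.adapter_a = nn.ParameterDict(
            {adapter_name: nn.Parameter(torch.zeros(base_layer.out_features, r))}
        )
        self.adapter_b = nn.ParameterDict(
            {adapter_name: nn.Parameter(0.01 * torch.randn(r, base_layer.in_features))}
        )

    def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
        return self.adapter_a[adapter_name] @ self.adapter_b[adapter_name]

    def _get_delta_activations(self, adapter_name: str, x: torch.Tensor) -> torch.Tensor:
        # no bias here: the bias is already included in the base_layer output
        return F.linear(x, self.get_delta_weight(adapter_name))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        result = self.base_layer(x)                       # frozen path, bias included
        for name in self.active_adapters:
            dropout = self.module_dropout[name]
            if (not self.training) or (torch.rand(1).item() > dropout):
                result = result + self._get_delta_activations(name, x)
        return result

# usage sketch:
# wrapped = SketchWrapperLinear(nn.Linear(10, 20))
# out = wrapped(torch.randn(2, 10))   # same shape as the base layer output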
- def merge(self, adapter_names: Optional[List[str]] = None) -> None: + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: if self.merged: warnings.warn( f"Already following adapters were merged {','.join(self.merged_adapters)}. " @@ -145,7 +116,20 @@ def merge(self, adapter_names: Optional[List[str]] = None) -> None: for active_adapter in adapter_names: if active_adapter in self._available_adapters: - self.weight.data += self.get_delta_weight(active_adapter) + base_layer = self.get_base_layer() + + if safe_merge: + orig_weights = base_layer.weight.data + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) self.merged_adapters.append(active_adapter) @abstractmethod @@ -175,7 +159,7 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self._available_adapters: - self.weight.data -= self.get_delta_weight(active_adapter) + self.base_layer.weight.data -= self.get_delta_weight(active_adapter) def unscale_layer(self, scale=None) -> None: for active_adapter in self.active_adapters: @@ -214,6 +198,7 @@ def __getattr__(self, name: str): def _check_target_module_exists(config, key): return check_target_module_exists(config, key) + @abstractmethod def _create_and_replace( self, config: LycorisConfig, @@ -224,68 +209,47 @@ def _create_and_replace( current_key, **optional_kwargs, ): - """ - A private method to create and replace the target module with the adapter module. - """ - - # Regexp matching - Find key which matches current target_name in patterns provided - pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) - target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name) - - kwargs = config.to_dict() - kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) - kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) - - if isinstance(target, LycorisLayer): - target.update_layer(adapter_name, **kwargs) - else: - new_module = self._create_new_module(config, adapter_name, target, **kwargs) - self._replace_module(parent, target_name, new_module, target) + ... 
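Because _create_and_replace becomes abstract here, each LyCORIS tuner (see the LoKrModel implementation earlier in this patch) performs its own rank/alpha resolution: the fully qualified module key is matched against the keys of rank_pattern and alpha_pattern, and the first hit overrides the config defaults. A self-contained sketch of that lookup follows, using made-up module names and pattern dictionaries.

import re
from itertools import chain

def resolve_rank_and_alpha(current_key, target_name, default_r, default_alpha,
                           rank_pattern, alpha_pattern):
    # find a pattern key that matches the end of the fully qualified module name,
    # falling back to the bare attribute name (and hence to the config defaults)
    pattern_keys = list(chain(rank_pattern.keys(), alpha_pattern.keys()))
    target_name_key = next(
        (key for key in pattern_keys if re.match(rf"(.*\.)?{key}$", current_key)),
        target_name,
    )
    r = rank_pattern.get(target_name_key, default_r)
    alpha = alpha_pattern.get(target_name_key, default_alpha)
    return r, alpha

# "decoder.layers.3.fc1" matches the pattern key "fc1", so it gets the overridden values
print(resolve_rank_and_alpha(
    "decoder.layers.3.fc1", "fc1", 8, 8.0, {"fc1": 16}, {"fc1": 32.0}
))  # -> (16, 32.0)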
@classmethod def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs) -> LycorisLayer: # Find corresponding subtype of provided target module new_module_cls = None for subtype, target_cls in cls.layers_mapping.items(): - if isinstance(target, subtype): + if ( + hasattr(target, "base_layer") + and isinstance(target.get_base_layer(), subtype) + and isinstance(target, BaseTunerLayer) + ): + # nested tuner layers are allowed + new_module_cls = target_cls + break + elif isinstance(target, subtype): new_module_cls = target_cls break # We didn't find corresponding type, so adapter for this layer is not supported if new_module_cls is None: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) raise ValueError( - f"Target module not found, currently only adapters for {', '.join([x.__name__ for x in cls.modules_mapping.keys()])} are supported" + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" ) - if isinstance(target, torch.nn.Conv2d): - new_module = new_module_cls( - target.in_channels, - target.out_channels, - target.weight.size()[2:], - stride=target.stride, - padding=target.padding, - dilation=target.dilation, - groups=target.groups, - bias=target.bias is not None, - padding_mode=target.padding_mode, - device=target.weight.device, - dtype=target.weight.dtype, - adapter_name=adapter_name, - **kwargs, - ) - elif isinstance(target, torch.nn.Linear): - new_module = new_module_cls( - target.in_features, - target.out_features, - bias=target.bias is not None, - device=target.weight.device, - dtype=target.weight.dtype, - adapter_name=adapter_name, - **kwargs, - ) + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Conv2d): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + else: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) raise ValueError( - "Target module not found, currently only adapters for nn.Linear and nn.Conv2d are supported" + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" ) return new_module @@ -305,12 +269,17 @@ def _replace_module(self, parent, child_name, new_module, child): setattr(parent, child_name, new_module) # It's not necessary to set requires_grad here, as that is handled by # _mark_only_adapters_as_trainable - new_module.weight = child.weight - if hasattr(child, "bias"): - new_module.bias = child.bias + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias if getattr(child, "state", None) is not None: - new_module.state = child.state + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state new_module.to(child.weight.device) # dispatch to correct device @@ -324,47 +293,30 @@ def _set_adapter_layers(self, enabled=True): module.enable_adapters(enabled) def _unload_and_optionally_merge( - self, merge=True, progressbar: bool = False, adapter_names: Optional[List[str]] = None + self, + merge: bool = True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[List[str]] = None, ): 
if merge: if getattr(self.model, "quantization_method", None) == "gptq": raise ValueError("Cannot merge LOHA layers when the model is gptq quantized") - key_list = [key for key, _ in self.model.named_modules() if "hada" not in key] + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] desc = "Unloading " + ("and merging " if merge else "") + "model" for key in tqdm(key_list, disable=not progressbar, desc=desc): try: parent, target, target_name = _get_submodules(self.model, key) except AttributeError: continue - if isinstance(target, LycorisLayer): - if isinstance(target, nn.Conv2d): - new_module = torch.nn.Conv2d( - target.in_channels, - target.out_channels, - kernel_size=target.kernel_size, - stride=target.stride, - padding=target.padding, - dilation=target.dilation, - ) - elif isinstance(target, nn.Linear): - bias = target.bias is not None - new_module = torch.nn.Linear( - target.in_features, - target.out_features, - bias=bias, - device=target.weight.device, - ) - else: - raise ValueError( - "Cannot convert current module to torch module, currently only adapters for nn.Linear and nn.Conv2d are supported" - ) - if merge: - target.merge(adapter_names=adapter_names) - self._replace_module(parent, target_name, new_module, target) - # save any additional trainable modules part of `modules_to_save` - if isinstance(target, ModulesToSaveWrapper): + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` setattr(parent, target_name, target.modules_to_save[target.active_adapter]) return self.model @@ -375,8 +327,34 @@ def enable_adapter_layers(self): def disable_adapter_layers(self): self._set_adapter_layers(enabled=False) - def merge_and_unload(self, progressbar: bool = False, adapter_names: Optional[List[str]] = None): - return self._unload_and_optionally_merge(progressbar=progressbar, adapter_names=adapter_names) + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[List[str]] = None + ): + r""" + This method merges the adapter layers into the base model. This is needed if someone wants to use the base + model as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self): + """ + Gets back the base model by removing all the lora modules without merging. This gives back the original base + model. 
+ """ + return self._unload_and_optionally_merge(merge=False) def set_adapter(self, adapter_name): for module in self.model.modules(): diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py index 004352237f..d9616d29d6 100644 --- a/src/peft/tuners/tuners_utils.py +++ b/src/peft/tuners/tuners_utils.py @@ -20,6 +20,7 @@ from abc import ABC, abstractmethod from typing import Any, Union +import torch from torch import nn from peft.utils import COMMON_LAYERS_PATTERN @@ -312,6 +313,34 @@ class BaseTunerLayer(ABC): # List all merged adapters merged_adapters: list[str] = [] + def get_base_layer(self) -> nn.Module: + """ + (Recursively) get the base_layer. + + This is necessary for the case that the tuner layer wraps another tuner layer. + + """ + base_layer = self + while hasattr(base_layer, "base_layer"): + base_layer = base_layer.base_layer + return base_layer + + @property + def weight(self) -> torch.Tensor: + # This is required for some transformers code, e.g. for T5, weight is accessed as: + # self.wo.weight + # where "wo" is the adapter layer. + # https://github.com/huggingface/transformers/blob/78f6ed6c70b29c1560780e3869a7ad4c6b3d2710/src/transformers + # /models/t5/modeling_t5.py#L292 + base_layer = self.get_base_layer() + if hasattr(base_layer, "qweight"): + # QuantLinear + weight = base_layer.qweight + else: + # Other layers + weight = base_layer.weight + return weight + def merge(self, *args) -> None: raise NotImplementedError diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 4f64fa4487..50f22a5523 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -277,8 +277,22 @@ def _set_trainable(model, adapter_name): def _set_adapter(model, adapter_name): + def check_adapter_name(adapter_name): + if isinstance(adapter_name, str): + return adapter_name + + # adapter_name is a list of str + if len(adapter_name) > 1: + raise ValueError("Only one adapter can be set at a time for modules_to_save") + elif len(adapter_name) == 0: + raise ValueError("Please specify at least one adapter to set") + adapter_name = adapter_name[0] + return adapter_name + for module in model.modules(): if isinstance(module, ModulesToSaveWrapper): + # only check the adapter_name if we actually encounter a ModulesToSaveWrapper, otherwise we don't care + adapter_name = check_adapter_name(adapter_name) module.set_adapter(adapter_name) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 14ae59b05c..347df218b2 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -466,6 +466,20 @@ def test_inference_safetensors(self, test_name, model_id, config_cls, config_kwa def test_peft_model_device_map(self, test_name, model_id, config_cls, config_kwargs): self._test_peft_model_device_map(model_id, config_cls, config_kwargs) + @parameterized.expand(TEST_CASES) + def test_forward_output_finite(self, test_name, model_id, config_cls, config_kwargs): + X = self.prepare_inputs_for_testing() + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model.eval() + with torch.no_grad(): + output = model(**X) + self.assertTrue(torch.isfinite(output).all()) + @parameterized.expand(TEST_CASES) def test_only_params_are_updated(self, test_name, model_id, config_cls, config_kwargs): # An explicit test that when using LoRA on a custom model, only the LoRA parameters are updated during training @@ -546,7 
+560,9 @@ def test_parameters_after_loading_model(self, test_name, model_id, config_cls, c @parameterized.expand(TEST_CASES) def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs): X = self.prepare_inputs_for_testing() - model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device).eval() + outputs_base = model(**X) + config = config_cls( base_model_name_or_path=model_id, **config_kwargs, @@ -555,6 +571,8 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs): model.eval() outputs_before = model(**X) + self.assertTrue(torch.allclose(outputs_base, outputs_before)) + model.train() # EmbConv1D is slow to learn for some reason lr = 0.01 if model_id != "EmbConv1D" else 1.0 @@ -732,6 +750,67 @@ def test_non_existing_model_card(self): # rough check that the model card is pre-filled self.assertGreater(len(model_card), 1000) + @parameterized.expand( + [ + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoKrConfig(target_modules=["lin0"], init_weights=False), + LoHaConfig(target_modules=["lin0"], init_weights=False), + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False), + IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False), + ] + ) + def test_adapter_name_makes_no_difference(self, config0): + # It should not matter whether we use the default adapter name or a custom one + model_cls = MLP + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + + # base model + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + output_base = base_model(input) + + # default name + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + torch.manual_seed(0) + peft_model_default = get_peft_model(base_model, config0, adapter_name="default").eval().to(self.torch_device) + output_default = peft_model_default(input) + sd_default = peft_model_default.state_dict() + + # custom name 1 + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + torch.manual_seed(0) + peft_model_custom1 = get_peft_model(base_model, config0, adapter_name="adapter").eval().to(self.torch_device) + output_custom1 = peft_model_custom1(input) + sd_custom1 = peft_model_custom1.state_dict() + + # custom name 2 + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + torch.manual_seed(0) + peft_model_custom2 = ( + get_peft_model(base_model, config0, adapter_name="other-name").eval().to(self.torch_device) + ) + output_custom2 = peft_model_custom2(input) + sd_custom2 = peft_model_custom2.state_dict() + + assert len(sd_default) == len(sd_custom1) == len(sd_custom2) + for key in sd_default: + key1 = key.replace("default", "adapter") + key2 = key.replace("default", "other-name") + assert key1 in sd_custom1 + assert key2 in sd_custom2 + for k0, k1, k2 in zip(sd_default, sd_custom1, sd_custom2): + assert torch.allclose(sd_default[k0], sd_custom1[k1]) + assert torch.allclose(sd_default[k0], sd_custom2[k2]) + + self.assertFalse(torch.allclose(output_base, output_default)) + self.assertFalse(torch.allclose(output_base, output_custom1)) + self.assertFalse(torch.allclose(output_base, output_custom2)) + self.assertTrue(torch.allclose(output_custom1, output_custom2)) + self.assertTrue(torch.allclose(output_default, output_custom1)) + class TestMultiRankAdapter(unittest.TestCase): """Tests related to multirank LoRA adapters""" @@ -808,8 +887,9 @@ def 
test_repr_lora_linear(self): config = LoraConfig(target_modules=["lin0"]) model = get_peft_model(MLP(), config) print_output = repr(model.model.lin0) - self.assertTrue(print_output.startswith("Linear")) - self.assertTrue("in_features=10, out_features=20" in print_output) + self.assertTrue(print_output.startswith("lora.Linear")) + self.assertTrue("in_features=10" in print_output) + self.assertTrue("out_features=20" in print_output) self.assertTrue("lora_A" in print_output) self.assertTrue("lora_B" in print_output) self.assertTrue("default" in print_output) @@ -818,7 +898,7 @@ def test_repr_lora_embedding(self): config = LoraConfig(target_modules=["emb"]) model = get_peft_model(ModelEmbConv1D(), config) print_output = repr(model.model.emb) - self.assertTrue(print_output.startswith("Embedding")) + self.assertTrue(print_output.startswith("lora.Embedding")) self.assertTrue("100, 5" in print_output) self.assertTrue("lora_embedding_A" in print_output) self.assertTrue("lora_embedding_B" in print_output) @@ -828,8 +908,9 @@ def test_repr_lora_conv1d(self): config = LoraConfig(target_modules=["conv1d"]) model = get_peft_model(ModelEmbConv1D(), config) print_output = repr(model.model.conv1d) - self.assertTrue(print_output.startswith("Linear")) - self.assertTrue("in_features=5, out_features=1" in print_output) + self.assertTrue(print_output.startswith("lora.Linear")) + self.assertTrue("in_features=5" in print_output) + self.assertTrue("out_features=1" in print_output) self.assertTrue("lora_A" in print_output) self.assertTrue("lora_B" in print_output) self.assertTrue("default" in print_output) @@ -838,7 +919,7 @@ def test_repr_lora_conv2d(self): config = LoraConfig(target_modules=["conv2d"]) model = get_peft_model(ModelConv2D(), config) print_output = repr(model.model.conv2d) - self.assertTrue(print_output.startswith("Conv2d")) + self.assertTrue(print_output.startswith("lora.Conv2d")) self.assertTrue("5, 10" in print_output) self.assertTrue("kernel_size=(3, 3)" in print_output) self.assertTrue("stride=(1, 1)" in print_output) diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index a6b3d16d4d..ab49c3eea5 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -245,6 +245,7 @@ def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, co "model_ids": PEFT_DECODER_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, "adalora_kwargs": {"init_lora_weights": [False]}, + "ia3_kwargs": {"init_ia3_weights": [False]}, "task_type": "CAUSAL_LM", }, filter_params_func=skip_adalora_and_gpt2, diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index ce09fc6247..2b4331ae20 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -156,6 +156,7 @@ def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_k "model_ids": PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, "adalora_kwargs": {"init_lora_weights": [False]}, + "ia3_kwargs": {"init_ia3_weights": [False]}, "task_type": "FEATURE_EXTRACTION", }, ) diff --git a/tests/testing_common.py b/tests/testing_common.py index 2c4a4f5b2b..e3a7040e1e 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -968,12 +968,12 @@ def _test_unload_adapter(self, model_id, config_cls, config_kwargs): model = get_peft_model(model, config) model = model.to(self.torch_device) - if config.peft_type not in ("LORA", "ADALORA"): + if config.peft_type not in 
("LORA", "ADALORA", "IA3"): with self.assertRaises(AttributeError): model = model.unload() else: dummy_input = self.prepare_inputs_for_testing() - logits_with_lora = model(**dummy_input)[0] + logits_with_adapter = model(**dummy_input)[0] transformers_model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) logits_transformers = transformers_model(**dummy_input)[0] @@ -982,7 +982,7 @@ def _test_unload_adapter(self, model_id, config_cls, config_kwargs): model = model.unload() logits_unload = model(**dummy_input)[0] - self.assertFalse(torch.allclose(logits_with_lora, logits_unload, atol=1e-10, rtol=1e-10)) + self.assertFalse(torch.allclose(logits_with_adapter, logits_unload, atol=1e-10, rtol=1e-10)) self.assertTrue(torch.allclose(logits_transformers, logits_unload, atol=1e-4, rtol=1e-4)) def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kwargs): @@ -992,13 +992,14 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw adapter_list = ["adapter1", "adapter_2", "adapter_3"] weight_list = [0.5, 1.5, 1.5] - model = self.transformers_class.from_pretrained(model_id) config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) if not isinstance(config, (LoraConfig)): return + + model = self.transformers_class.from_pretrained(model_id) model = get_peft_model(model, config, adapter_list[0]) model.add_adapter(adapter_list[1], config) model.add_adapter(adapter_list[2], replace(config, r=20)) @@ -1113,7 +1114,7 @@ def get_output(model): # must be False if isinstance(peft_model, StableDiffusionPipeline): # for SD, check that most pixels have different values - self.assertTrue((output_before != output_peft).float().mean() > 0.9) + self.assertTrue((output_before != output_peft).float().mean() > 0.8) else: self.assertFalse(torch.allclose(output_before, output_peft)) From 21df968fd12b5eaf6481496a44191c4f7c236462 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:43:36 +0100 Subject: [PATCH 39/65] [`Tests`] Fix daily CI (#1136) * fix daily CI * adapt from suggestion --- tests/test_gpu_examples.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index 9c4fad2132..103808e251 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -44,6 +44,7 @@ prepare_model_for_int8_training, prepare_model_for_kbit_training, ) +from peft.utils import SAFETENSORS_WEIGHTS_NAME from .testing_utils import ( require_auto_gptq, @@ -177,7 +178,7 @@ def test_causal_lm_training(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -235,7 +236,7 @@ def test_causal_lm_training_4bit(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -296,7 +297,7 @@ def test_causal_lm_training_mutli_gpu_4bit(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - 
self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -357,7 +358,7 @@ def test_4bit_adalora_causalLM(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -421,7 +422,7 @@ def test_causal_lm_training_mutli_gpu(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -481,7 +482,7 @@ def test_seq2seq_lm_training_single_gpu(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -542,7 +543,7 @@ def test_seq2seq_lm_training_mutli_gpu(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -640,7 +641,7 @@ def make_inputs_require_grad(module, input, output): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -719,7 +720,7 @@ def test_causal_lm_training(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -781,7 +782,7 @@ def test_adalora_causalLM(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) @@ -844,7 +845,7 @@ def test_causal_lm_training_mutli_gpu(self): model.cpu().save_pretrained(tmp_dir) self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) - self.assertTrue("adapter_model.bin" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) # assert loss is not None self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) From 99e1a55f54fd9fa7c1d6ec3288f4295634615119 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:12:39 +0100 Subject: [PATCH 40/65] [`core` / `LoRA`] Add `adapter_names` in bnb layers (#1139) * Update 
bnb.py * fix style --- src/peft/tuners/lora/bnb.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/peft/tuners/lora/bnb.py b/src/peft/tuners/lora/bnb.py index 1c42a9e8e3..dd672adfcc 100644 --- a/src/peft/tuners/lora/bnb.py +++ b/src/peft/tuners/lora/bnb.py @@ -14,6 +14,7 @@ # limitations under the License. import warnings +from typing import List, Optional import bitsandbytes as bnb import torch @@ -43,7 +44,7 @@ def __init__( self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - def merge(self, safe_merge: bool = False): + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights into the base weights @@ -52,6 +53,9 @@ def merge(self, safe_merge: bool = False): If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. """ if self.merged: warnings.warn( @@ -59,7 +63,10 @@ def merge(self, safe_merge: bool = False): f"You are now additionally merging {','.join(self.active_adapters)}." ) - for active_adapter in self.active_adapters: + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter not in self.lora_A.keys(): continue warnings.warn( @@ -191,7 +198,7 @@ def __init__( self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - def merge(self, safe_merge: bool = False): + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: """ Merge the active adapter weights into the base weights @@ -200,6 +207,9 @@ def merge(self, safe_merge: bool = False): If True, the merge operation will be performed in a copy of the original weights and check for NaNs before merging the weights. This is useful if you want to check if the merge operation will produce NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. """ if self.merged: warnings.warn( @@ -207,7 +217,10 @@ def merge(self, safe_merge: bool = False): f"You are now additionally merging {','.join(self.active_adapters)}." 
) - for active_adapter in self.active_adapters: + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: if active_adapter not in self.lora_A.keys(): continue warnings.warn( From 18a09101132473200a68f5b7f2cbe00a6932c2bf Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:11:19 +0100 Subject: [PATCH 41/65] =?UTF-8?q?[`Tests`]=C2=A0Do=20not=20stop=20tests=20?= =?UTF-8?q?if=20a=20job=20failed=20(#1141)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update nightly.yml * Update nightly.yml --- .github/workflows/nightly.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 86a6e271c1..4183b7bf4e 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -15,6 +15,8 @@ env: jobs: run_all_tests_single_gpu: + strategy: + fail-fast: false runs-on: [self-hosted, docker-gpu, multi-gpu] env: CUDA_VISIBLE_DEVICES: "0" @@ -57,6 +59,8 @@ jobs: python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY run_all_tests_multi_gpu: + strategy: + fail-fast: false runs-on: [self-hosted, docker-gpu, multi-gpu] env: CUDA_VISIBLE_DEVICES: "0,1" From 9cdaed27693ec202b6792349cf41f22b394c8f5e Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 17 Nov 2023 14:11:54 +0100 Subject: [PATCH 42/65] CI Add Python 3.11 to test matrix (#1143) Only required change was to call .value on some enums when used in messages, as their repr has changed in Python 3.11. --- .github/workflows/tests.yml | 2 +- src/peft/tuners/p_tuning/model.py | 2 +- src/peft/tuners/prompt_tuning/config.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 33c89bee09..a733aaf480 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,7 +28,7 @@ jobs: needs: check_code_quality strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] os: ["ubuntu-latest", "macos-latest", "windows-latest"] runs-on: ${{ matrix.os }} steps: diff --git a/src/peft/tuners/p_tuning/model.py b/src/peft/tuners/p_tuning/model.py index 204d505b68..04b6e2814a 100644 --- a/src/peft/tuners/p_tuning/model.py +++ b/src/peft/tuners/p_tuning/model.py @@ -104,7 +104,7 @@ def __init__(self, config): encoder_num_layers_default = PromptEncoderConfig.encoder_num_layers if config.encoder_num_layers != encoder_num_layers_default: warnings.warn( - f"for {self.encoder_type}, the argument `encoder_num_layers` is ignored. " + f"for {self.encoder_type.value}, the argument `encoder_num_layers` is ignored. " f"Exactly {encoder_num_layers_default} MLP layers are used." ) layers = [ diff --git a/src/peft/tuners/prompt_tuning/config.py b/src/peft/tuners/prompt_tuning/config.py index 46df189673..9f15b1a881 100644 --- a/src/peft/tuners/prompt_tuning/config.py +++ b/src/peft/tuners/prompt_tuning/config.py @@ -73,4 +73,6 @@ def __post_init__(self): self.peft_type = PeftType.PROMPT_TUNING if self.tokenizer_kwargs and (self.prompt_tuning_init != PromptTuningInit.TEXT): - raise ValueError(f"tokenizer_kwargs only valid when using prompt_tuning_init='{PromptTuningInit.TEXT}'.") + raise ValueError( + f"tokenizer_kwargs only valid when using prompt_tuning_init='{PromptTuningInit.TEXT.value}'." 
+ ) From b5a8a294edd849ac09584a1250dcec8a85e89f18 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 17 Nov 2023 15:18:34 +0100 Subject: [PATCH 43/65] FIX A few issues with AdaLora, adding tests (#1146) This PR fixes a handful of issues with AdaLora, should resolve #1113. Description 1. lora_A.weight.device was called but for AdaLora, lora_A is a nn.Paramter, not an nn.Module, so the weight attribute does not exist. lora_A.device is sufficient. 2. For 8bit, an inplace operation failed because it was on a view. Now the operation is no longer inplace. 3. The loss term of the model output is not necessarily a torch tensor. In the test, it was a dict and did not contain an actual loss. Therefore, I added a check to make sure the loss is a torch tensor. --- src/peft/tuners/adalora/bnb.py | 5 +- src/peft/tuners/adalora/model.py | 2 +- tests/test_gpu_examples.py | 84 ++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/src/peft/tuners/adalora/bnb.py b/src/peft/tuners/adalora/bnb.py index a37745569a..b5e2b8a1c4 100644 --- a/src/peft/tuners/adalora/bnb.py +++ b/src/peft/tuners/adalora/bnb.py @@ -70,7 +70,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if requires_conversion: output = output.to(expected_dtype) output = output * scaling / ranknum - result += output + # inplace operation on view is forbidden for MatMul8bitLtBackward, so avoid it + result = result + output return result def __repr__(self) -> str: @@ -127,7 +128,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: requires_conversion = not torch.is_autocast_enabled() if requires_conversion: expected_dtype = result.dtype - compute_dtype = lora_A.weight.dtype + compute_dtype = lora_A.dtype if x.dtype != compute_dtype: x = x.to(compute_dtype) diff --git a/src/peft/tuners/adalora/model.py b/src/peft/tuners/adalora/model.py index 71f2ed7579..7ccf13e8c9 100644 --- a/src/peft/tuners/adalora/model.py +++ b/src/peft/tuners/adalora/model.py @@ -236,7 +236,7 @@ def __getattr__(self, name: str): def forward(self, *args, **kwargs): outputs = self.model.forward(*args, **kwargs) - if getattr(outputs, "loss", None) is not None: + if (getattr(outputs, "loss", None) is not None) and isinstance(outputs.loss, torch.Tensor): # Calculate the orthogonal regularization orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index 103808e251..1af1919ad3 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -125,6 +125,14 @@ def tearDown(self): torch.cuda.empty_cache() gc.collect() + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + self.assertTrue(torch.isfinite(output.logits).all()) + model.train(training) + @pytest.mark.single_gpu_tests def test_causal_lm_training(self): r""" @@ -335,6 +343,71 @@ def test_4bit_adalora_causalLM(self): data = load_dataset("ybelkada/english_quotes_copy") data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + batch = tokenizer(data["train"][:3]["quote"], return_tensors="pt", padding=True) + self._check_inference_finite(model, batch) + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + 
learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + self.assertTrue("adapter_config.json" in os.listdir(tmp_dir)) + self.assertTrue(SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir)) + + # assert loss is not None + self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"]) + + @pytest.mark.single_gpu_tests + @require_torch_gpu + def test_8bit_adalora_causalLM(self): + r""" + Tests the 8bit training with adalora + """ + model_id = "facebook/opt-350m" + + model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + model.gradient_checkpointing_enable() + model = prepare_model_for_kbit_training(model) + + peft_config = AdaLoraConfig( + init_r=6, + target_r=4, + tinit=50, + tfinal=100, + deltaT=5, + beta1=0.3, + beta2=0.3, + orth_reg_weight=0.2, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, peft_config) + + data = load_dataset("ybelkada/english_quotes_copy") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + batch = tokenizer(data["train"][:3]["quote"], return_tensors="pt", padding=True) + self._check_inference_finite(model, batch) with tempfile.TemporaryDirectory() as tmp_dir: trainer = Trainer( @@ -671,6 +744,14 @@ def tearDown(self): gc.collect() torch.cuda.empty_cache() + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + self.assertTrue(torch.isfinite(output.logits).all()) + model.train(training) + @pytest.mark.single_gpu_tests def test_causal_lm_training(self): r""" @@ -738,6 +819,7 @@ def test_adalora_causalLM(self): quantization_config=self.quantization_config, ) + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) model = prepare_model_for_kbit_training(model) peft_config = AdaLoraConfig( @@ -759,6 +841,8 @@ def test_adalora_causalLM(self): data = load_dataset("ybelkada/english_quotes_copy") data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + batch = tokenizer(data["train"][:3]["quote"], return_tensors="pt", padding=True) + self._check_inference_finite(model, batch) with tempfile.TemporaryDirectory() as tmp_dir: trainer = Trainer( From f1ecfa6ae6eba599ae89decbf47b339d8c9d39a3 Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 17 Nov 2023 15:48:02 +0100 Subject: [PATCH 44/65] Use `huggingface_hub.file_exists` instead of custom helper (#1145) * Use 'huggingface_hub.file_exists' instead of custom helper * make quality --- setup.py | 1 + src/peft/utils/__init__.py | 1 - src/peft/utils/hub_utils.py | 29 ----------------------------- src/peft/utils/save_and_load.py | 9 ++++----- 4 files changed, 5 insertions(+), 35 deletions(-) delete mode 100644 src/peft/utils/hub_utils.py diff --git a/setup.py b/setup.py index 975c9c36f8..13aa58e708 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ "tqdm", "accelerate>=0.21.0", "safetensors", + "huggingface_hub>=0.17.0", ], extras_require=extras, classifiers=[ diff --git a/src/peft/utils/__init__.py b/src/peft/utils/__init__.py index 1ba150eb02..7cb25e75aa 100644 --- a/src/peft/utils/__init__.py +++ b/src/peft/utils/__init__.py @@ -47,5 +47,4 @@ get_quantization_config, id_tensor_storage, ) -from .hub_utils import 
hub_file_exists from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights diff --git a/src/peft/utils/hub_utils.py b/src/peft/utils/hub_utils.py deleted file mode 100644 index 625f4e490d..0000000000 --- a/src/peft/utils/hub_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -# coding=utf-8 -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from huggingface_hub import get_hf_file_metadata, hf_hub_url -from huggingface_hub.utils import EntryNotFoundError - - -def hub_file_exists(repo_id: str, filename: str, revision: str = None, repo_type: str = None) -> bool: - r""" - Checks if a file exists in a remote Hub repository. - """ - url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type=repo_type, revision=revision) - try: - get_hf_file_metadata(url) - return True - except EntryNotFoundError: - return False diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index cd8088e93e..75e934747e 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -16,11 +16,10 @@ from typing import Optional import torch -from huggingface_hub import hf_hub_download +from huggingface_hub import file_exists, hf_hub_download from huggingface_hub.utils import EntryNotFoundError from safetensors.torch import load_file as safe_load_file -from .hub_utils import hub_file_exists from .other import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, infer_device from .peft_types import PeftType @@ -194,9 +193,9 @@ def load_peft_weights(model_id: str, device: Optional[str] = None, **hf_hub_down filename = os.path.join(path, WEIGHTS_NAME) use_safetensors = False else: - has_remote_safetensors_file = hub_file_exists( - model_id, - SAFETENSORS_WEIGHTS_NAME, + has_remote_safetensors_file = file_exists( + repo_id=model_id, + filename=SAFETENSORS_WEIGHTS_NAME, revision=hf_hub_download_kwargs.get("revision", None), repo_type=hf_hub_download_kwargs.get("repo_type", None), ) From 8351331d78494a8fd6c9e0772fea95d4d04f6de6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Rodr=C3=ADguez=20Salamanca?= Date: Mon, 20 Nov 2023 18:22:52 +0100 Subject: [PATCH 45/65] ENH Delete IA3 adapters (#1153) --- src/peft/tuners/ia3/layer.py | 5 +---- src/peft/tuners/ia3/model.py | 33 ++++++++++++++++++++++++++++----- tests/testing_common.py | 10 +++++----- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/peft/tuners/ia3/layer.py b/src/peft/tuners/ia3/layer.py index 45ef388399..e682c2bdd5 100644 --- a/src/peft/tuners/ia3/layer.py +++ b/src/peft/tuners/ia3/layer.py @@ -27,12 +27,9 @@ class IA3Layer(BaseTunerLayer): # All names of layers that may contain adapter weights adapter_layer_names = ("ia3_l",) - # All names of other parameters that may contain adapter-related parameters - other_layer_names = ("scaling",) def __init__(self, base_layer: nn.Module, is_feedforward: bool, **kwargs) -> None: self.base_layer = base_layer - self.scaling = {} self.ia3_l = nn.ParameterDict({}) # Mark the weight as unmerged 
self._disable_adapters = False @@ -294,7 +291,7 @@ def unmerge(self) -> None: base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: - previous_dtype = x.dtype + dtype = previous_dtype = x.dtype if self.disable_adapters: if self.merged: diff --git a/src/peft/tuners/ia3/model.py b/src/peft/tuners/ia3/model.py index 7b2f9d19d9..725faf084b 100644 --- a/src/peft/tuners/ia3/model.py +++ b/src/peft/tuners/ia3/model.py @@ -78,6 +78,8 @@ class IA3Model(BaseTuner): - **peft_config** ([`ia3Config`]): The configuration of the (IA)^3 model. """ + prefix: str = "ia3_" + def __init__(self, model, config, adapter_name): super().__init__(model, config, adapter_name) @@ -146,7 +148,7 @@ def _check_target_module_exists(ia3_config, key): def _mark_only_adapters_as_trainable(self) -> None: for n, p in self.model.named_parameters(): - if "ia3_" not in n: + if self.prefix not in n: p.requires_grad = False def _create_and_replace( @@ -202,8 +204,7 @@ def _check_target_module_feedforward(ia3_config, key) -> bool: is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules) return is_feedforward - @staticmethod - def _replace_module(parent, child_name, new_module, child): + def _replace_module(self, parent, child_name, new_module, child): setattr(parent, child_name, new_module) # child layer wraps the original module, unpack it @@ -225,7 +226,7 @@ def _replace_module(parent, child_name, new_module, child): # dispatch to correct device for name, module in new_module.named_modules(): - if "ia3_" in name: + if self.prefix in name: module.to(child.weight.device) def __getattr__(self, name: str): @@ -298,7 +299,7 @@ def _unload_and_optionally_merge( if getattr(self.model, "is_loaded_in_4bit", False): raise ValueError("Cannot merge ia3 layers when the model is loaded in 4-bit mode") - key_list = [key for key, _ in self.model.named_modules() if "ia3" not in key] + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] for key in key_list: try: parent, target, target_name = _get_submodules(self.model, key) @@ -348,3 +349,25 @@ def unload(self): model. """ return self._unload_and_optionally_merge(merge=False) + + def delete_adapter(self, adapter_name: str): + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. 
+ """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, IA3Layer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] diff --git a/tests/testing_common.py b/tests/testing_common.py index e3a7040e1e..17b9f147c2 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -885,7 +885,7 @@ def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwar self.assertIsNotNone(param.grad) def _test_delete_adapter(self, model_id, config_cls, config_kwargs): - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] # IA3 does not support deleting adapters yet, but it just needs to be added # AdaLora does not support multiple adapters config = config_cls( @@ -905,7 +905,7 @@ def _test_delete_adapter(self, model_id, config_cls, config_kwargs): self.assertFalse(adapter_to_delete in model.peft_config) self.assertEqual(model.active_adapters, ["default"]) - key_list = [key for key, _ in model.named_modules() if "lora" not in key] + key_list = [key for key, _ in model.named_modules()] for key in key_list: _, target, _ = _get_submodules(model, key) attributes_to_check = getattr(target, "adapter_layer_names", []) + getattr(target, "other_param_names", []) @@ -923,7 +923,7 @@ def _test_delete_adapter(self, model_id, config_cls, config_kwargs): def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): # same as test_delete_adapter, but this time an inactive adapter is deleted - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] # IA3 does not support deleting adapters yet, but it just needs to be added # AdaLora does not support multiple adapters config = config_cls( @@ -943,7 +943,7 @@ def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): self.assertFalse(adapter_to_delete in model.peft_config) self.assertEqual(model.active_adapters, ["default"]) - key_list = [key for key, _ in model.named_modules() if "lora" not in key] + key_list = [key for key, _ in model.named_modules()] for key in key_list: _, target, _ = _get_submodules(model, key) attributes_to_check = getattr(target, "adapter_layer_names", []) + getattr(target, "other_param_names", []) @@ -1038,7 +1038,7 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw for new_adapter in new_adapters: self.assertTrue(new_adapter in model.peft_config) - key_list = [key for key, _ in model.named_modules() if "lora" not in key] + key_list = [key for key, _ in model.named_modules()] for key in key_list: _, target, _ = _get_submodules(model, key) if isinstance(target, LoraLayer): From 0ae52fece17a3514116815984444116b75d9c5ca Mon Sep 17 00:00:00 2001 From: Mishig Date: Tue, 21 Nov 2023 01:57:56 -0800 Subject: [PATCH 46/65] [Docs fix] Relative path issue (#1157) --- docs/source/developer_guides/custom_models.mdx | 2 +- docs/source/developer_guides/low_level_api.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/developer_guides/custom_models.mdx 
b/docs/source/developer_guides/custom_models.mdx index 014534d337..08250aee56 100644 --- a/docs/source/developer_guides/custom_models.mdx +++ b/docs/source/developer_guides/custom_models.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. Some fine-tuning techniques, such as prompt tuning, are specific to language models. That means in πŸ€— PEFT, it is assumed a πŸ€— Transformers model is being used. However, other fine-tuning techniques - like -[LoRA](./conceptual_guides/lora) - are not restricted to specific model types. +[LoRA](../conceptual_guides/lora) - are not restricted to specific model types. In this guide, we will see how LoRA can be applied to a multilayer perceptron and a computer vision model from the [timm](https://huggingface.co/docs/timm/index) library. diff --git a/docs/source/developer_guides/low_level_api.mdx b/docs/source/developer_guides/low_level_api.mdx index e2a17df201..55e1897887 100644 --- a/docs/source/developer_guides/low_level_api.mdx +++ b/docs/source/developer_guides/low_level_api.mdx @@ -17,7 +17,7 @@ The development of this API has been motivated by the need for super users to no ## Supported tuner types -Currently the supported adapter types are the 'injectable' adapters, meaning adapters where an inplace modification of the model is sufficient to correctly perform the fine tuning. As such, only [LoRA](./conceptual_guides/lora), AdaLoRA and [IA3](./conceptual_guides/ia3) are currently supported in this API. +Currently the supported adapter types are the 'injectable' adapters, meaning adapters where an inplace modification of the model is sufficient to correctly perform the fine tuning. As such, only [LoRA](../conceptual_guides/lora), AdaLoRA and [IA3](../conceptual_guides/ia3) are currently supported in this API. 
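For orientation while reading this docs-only hunk: the page being edited documents the low-level `inject_adapter_in_model` entry point for the injectable adapters it lists (LoRA, AdaLoRA, IA3). Below is a minimal, hedged sketch of how that API is typically called; the toy module and the exact call signature are assumptions drawn from the public PEFT documentation, not something introduced by this patch.

```python
# Illustrative sketch only -- the signature of `inject_adapter_in_model` is
# assumed from the PEFT docs page edited above; verify it against your
# installed version before relying on it.
import torch
from peft import LoraConfig, inject_adapter_in_model


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(10, 10)
        self.lm_head = torch.nn.Linear(10, 2)

    def forward(self, x):
        return self.lm_head(self.linear(x))


# Target the submodule by name, as the low-level API guide describes.
lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, target_modules=["linear"])
model = inject_adapter_in_model(lora_config, TinyModel())

# The targeted nn.Linear is now wrapped with LoRA sub-layers; the rest of the
# model is left untouched and no PeftModel wrapper is created.
print(model.linear)
```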
## `inject_adapter_in_model` method From b4ac2d840b4e0ea15d2964a940c71b6d55bec006 Mon Sep 17 00:00:00 2001 From: Lukas Kuhn Date: Wed, 22 Nov 2023 12:23:50 +0100 Subject: [PATCH 47/65] FIX Dataset loaded twice in 4-bit finetuning script (#1164) --- examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py b/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py index f0fc29d8e1..018cc53b05 100755 --- a/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py +++ b/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py @@ -131,11 +131,6 @@ def print_trainable_parameters(model): """### Training""" - -data = load_dataset("Abirate/english_quotes") -data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) - - data = load_dataset("Abirate/english_quotes") data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) From 043238578f9c7af88334005819543cf60e263760 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:44:21 +0530 Subject: [PATCH 48/65] fix `add_weighted_adapter` method (#1169) * fix `add_weighted_adapter` method Co-Authored-By: Benjamin Bossan Co-Authored-By: jihuishan <151612440+jihuishan@users.noreply.github.com> * Update testing_common.py --------- Co-authored-by: Benjamin Bossan Co-authored-by: jihuishan <151612440+jihuishan@users.noreply.github.com> --- src/peft/tuners/lora/model.py | 5 +++-- tests/testing_common.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index a5b7735ce3..653a684276 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math import operator import re import warnings @@ -517,8 +518,8 @@ def add_weighted_adapter( current_adapter_lora_B = target.lora_embedding_B[adapter] else: continue - target_lora_A.data += current_adapter_lora_A.data * weight * target.scaling[adapter] - target_lora_B.data += current_adapter_lora_B.data + target_lora_A.data += current_adapter_lora_A.data * math.sqrt(weight) * target.scaling[adapter] + target_lora_B.data += current_adapter_lora_B.data * math.sqrt(weight) elif combination_type == "cat": loras_A, loras_B = [], [] for adapter, weight in zip(adapters, weights): diff --git a/tests/testing_common.py b/tests/testing_common.py index 17b9f147c2..00809c2bc1 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -163,6 +163,7 @@ class PeftCommonTester: transformers_class (`transformers.PreTrainedModel`): The transformers class that is being tested. 
""" + torch_device = infer_device() transformers_class = None @@ -1021,6 +1022,14 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw adapter_list[:2], weight_list[:2], "multi_adapter_linear_reweighting", combination_type="linear" ) + # test linear re-weighting with multiple adapters with only first adapter having non zero weight + model.add_weighted_adapter( + adapter_list[:2], + [weight_list[0], 0], + "multi_adapter_linear_reweighting_single_enabled", + combination_type="linear", + ) + with self.assertRaises(ValueError): model.add_weighted_adapter( adapter_list[1:], @@ -1034,6 +1043,7 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw "multi_adapter_svd_reweighting", "multi_adapter_cat_reweighting", "multi_adapter_linear_reweighting", + "multi_adapter_linear_reweighting_single_enabled", ] for new_adapter in new_adapters: self.assertTrue(new_adapter in model.peft_config) From dd4771b2f42a72b9d7a54fbf9815530659305718 Mon Sep 17 00:00:00 2001 From: Costa Huang Date: Wed, 22 Nov 2023 14:52:26 -0500 Subject: [PATCH 49/65] (minor) correct type annotation (#1166) * add correct type annotation * make style --- src/peft/config.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/peft/config.py b/src/peft/config.py index 63f4ad6eea..1916a6a3c9 100644 --- a/src/peft/config.py +++ b/src/peft/config.py @@ -213,10 +213,12 @@ class PeftConfig(PeftConfigMixin): inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode. """ - base_model_name_or_path: str = field(default=None, metadata={"help": "The name of the base model to use."}) - revision: str = field(default=None, metadata={"help": "The specific model version to use."}) - peft_type: Union[str, PeftType] = field(default=None, metadata={"help": "Peft type"}) - task_type: Union[str, TaskType] = field(default=None, metadata={"help": "Task type"}) + base_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "The name of the base model to use."} + ) + revision: Optional[str] = field(default=None, metadata={"help": "The specific model version to use."}) + peft_type: Optional[Union[str, PeftType]] = field(default=None, metadata={"help": "Peft type"}) + task_type: Optional[Union[str, TaskType]] = field(default=None, metadata={"help": "Task type"}) inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"}) From a634f6a13e1b0b55a78b78b39bbc6fde425f58a5 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 23 Nov 2023 10:35:53 +0100 Subject: [PATCH 50/65] Update release checklist about release notes (#1170) Add a reminder in the release checklist to consult the release note google doc. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 13aa58e708..8e3d60ec7c 100644 --- a/setup.py +++ b/setup.py @@ -83,4 +83,5 @@ # 8. Upload the final version to actual pypi: # twine upload dist/* -r pypi # 9. Add release notes to the tag on https://github.com/huggingface/peft/releases once everything is looking hunky-dory. +# Check the notes here: https://docs.google.com/document/d/1k-sOIfykuKjWcOIALqjhFKz4amFEp-myeJUJEzNgjoU/edit?usp=sharing # 10. Update the version in __init__.py, setup.py to the bumped minor version + ".dev0" (e.g. 
from "0.6.0" to "0.7.0.dev0") From fb607d00adc7b8aa1694f58a786ce2cd5681710d Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 23 Nov 2023 02:38:57 -0800 Subject: [PATCH 51/65] DOC convert mdx to md (#1171) Content can still technically be mdx but mdx is not rendered well on GitHub, so this makes reviewing doc files easier. --- ...eepspeed-zero3-offload.mdx => deepspeed-zero3-offload.md} | 4 ++++ docs/source/accelerate/{fsdp.mdx => fsdp.md} | 4 ++++ docs/source/conceptual_guides/{ia3.mdx => ia3.md} | 4 ++++ docs/source/conceptual_guides/{lora.mdx => lora.md} | 4 ++++ .../source/conceptual_guides/{prompting.mdx => prompting.md} | 5 +++++ .../developer_guides/{contributing.mdx => contributing.md} | 4 ++++ .../developer_guides/{custom_models.mdx => custom_models.md} | 4 ++++ .../developer_guides/{low_level_api.mdx => low_level_api.md} | 4 ++++ .../{troubleshooting.mdx => troubleshooting.md} | 4 ++++ docs/source/{index.mdx => index.md} | 4 ++++ docs/source/{install.mdx => install.md} | 4 ++++ docs/source/package_reference/{config.mdx => config.md} | 4 ++++ .../package_reference/{peft_model.mdx => peft_model.md} | 4 ++++ docs/source/package_reference/{tuners.mdx => tuners.md} | 4 ++++ docs/source/{quicktour.mdx => quicktour.md} | 4 ++++ .../{clm-prompt-tuning.mdx => clm-prompt-tuning.md} | 4 ++++ .../task_guides/{dreambooth_lora.mdx => dreambooth_lora.md} | 4 ++++ ..._classification_lora.mdx => image_classification_lora.md} | 4 ++++ docs/source/task_guides/{int8-asr.mdx => int8-asr.md} | 4 ++++ ...-seq-classification.mdx => ptuning-seq-classification.md} | 4 ++++ docs/source/task_guides/semantic-similarity-lora.md | 4 ++++ ...c_segmentation_lora.mdx => semantic_segmentation_lora.md} | 4 ++++ .../{seq2seq-prefix-tuning.mdx => seq2seq-prefix-tuning.md} | 4 ++++ ...-classification-lora.mdx => token-classification-lora.md} | 4 ++++ 24 files changed, 97 insertions(+) rename docs/source/accelerate/{deepspeed-zero3-offload.mdx => deepspeed-zero3-offload.md} (98%) rename docs/source/accelerate/{fsdp.mdx => fsdp.md} (97%) rename docs/source/conceptual_guides/{ia3.mdx => ia3.md} (95%) rename docs/source/conceptual_guides/{lora.mdx => lora.md} (97%) rename docs/source/conceptual_guides/{prompting.mdx => prompting.md} (96%) rename docs/source/developer_guides/{contributing.mdx => contributing.md} (97%) rename docs/source/developer_guides/{custom_models.mdx => custom_models.md} (97%) rename docs/source/developer_guides/{low_level_api.mdx => low_level_api.md} (96%) rename docs/source/developer_guides/{troubleshooting.mdx => troubleshooting.md} (96%) rename docs/source/{index.mdx => index.md} (98%) rename docs/source/{install.mdx => install.md} (89%) rename docs/source/package_reference/{config.mdx => config.md} (77%) rename docs/source/package_reference/{peft_model.mdx => peft_model.md} (86%) rename docs/source/package_reference/{tuners.mdx => tuners.md} (78%) rename docs/source/{quicktour.mdx => quicktour.md} (97%) rename docs/source/task_guides/{clm-prompt-tuning.mdx => clm-prompt-tuning.md} (98%) rename docs/source/task_guides/{dreambooth_lora.mdx => dreambooth_lora.md} (98%) rename docs/source/task_guides/{image_classification_lora.mdx => image_classification_lora.md} (98%) rename docs/source/task_guides/{int8-asr.mdx => int8-asr.md} (98%) rename docs/source/task_guides/{ptuning-seq-classification.mdx => ptuning-seq-classification.md} (98%) rename docs/source/task_guides/{semantic_segmentation_lora.mdx => semantic_segmentation_lora.md} (99%) rename 
docs/source/task_guides/{seq2seq-prefix-tuning.mdx => seq2seq-prefix-tuning.md} (98%) rename docs/source/task_guides/{token-classification-lora.mdx => token-classification-lora.md} (98%) diff --git a/docs/source/accelerate/deepspeed-zero3-offload.mdx b/docs/source/accelerate/deepspeed-zero3-offload.md similarity index 98% rename from docs/source/accelerate/deepspeed-zero3-offload.mdx rename to docs/source/accelerate/deepspeed-zero3-offload.md index b4ba130fec..9324e383a7 100644 --- a/docs/source/accelerate/deepspeed-zero3-offload.mdx +++ b/docs/source/accelerate/deepspeed-zero3-offload.md @@ -1,3 +1,7 @@ + + # DeepSpeed [DeepSpeed](https://www.deepspeed.ai/) is a library designed for speed and scale for distributed training of large models with billions of parameters. At its core is the Zero Redundancy Optimizer (ZeRO) that shards optimizer states (ZeRO-1), gradients (ZeRO-2), and parameters (ZeRO-3) across data parallel processes. This drastically reduces memory usage, allowing you to scale your training to billion parameter models. To unlock even more memory efficiency, ZeRO-Offload reduces GPU compute and memory by leveraging CPU resources during optimization. diff --git a/docs/source/accelerate/fsdp.mdx b/docs/source/accelerate/fsdp.md similarity index 97% rename from docs/source/accelerate/fsdp.mdx rename to docs/source/accelerate/fsdp.md index 59dd4afcec..473dd655b9 100644 --- a/docs/source/accelerate/fsdp.mdx +++ b/docs/source/accelerate/fsdp.md @@ -1,3 +1,7 @@ + + # Fully Sharded Data Parallel [Fully sharded data parallel](https://pytorch.org/docs/stable/fsdp.html) (FSDP) is developed for distributed training of large pretrained models up to 1T parameters. FSDP achieves this by sharding the model parameters, gradients, and optimizer states across data parallel processes and it can also offload sharded model parameters to a CPU. The memory efficiency afforded by FSDP allows you to scale training to larger batch or model sizes. diff --git a/docs/source/conceptual_guides/ia3.mdx b/docs/source/conceptual_guides/ia3.md similarity index 95% rename from docs/source/conceptual_guides/ia3.mdx rename to docs/source/conceptual_guides/ia3.md index cb04d4818e..62364898d8 100644 --- a/docs/source/conceptual_guides/ia3.mdx +++ b/docs/source/conceptual_guides/ia3.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # IA3 diff --git a/docs/source/conceptual_guides/lora.mdx b/docs/source/conceptual_guides/lora.md similarity index 97% rename from docs/source/conceptual_guides/lora.mdx rename to docs/source/conceptual_guides/lora.md index 4f3027241c..67e16edda0 100644 --- a/docs/source/conceptual_guides/lora.mdx +++ b/docs/source/conceptual_guides/lora.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # LoRA diff --git a/docs/source/conceptual_guides/prompting.mdx b/docs/source/conceptual_guides/prompting.md similarity index 96% rename from docs/source/conceptual_guides/prompting.mdx rename to docs/source/conceptual_guides/prompting.md index 1064a9affc..774d7f43e6 100644 --- a/docs/source/conceptual_guides/prompting.mdx +++ b/docs/source/conceptual_guides/prompting.md @@ -1,3 +1,8 @@ + + + # Prompting Training large pretrained language models is very time-consuming and compute-intensive. As they continue to grow in size, there is increasing interest in more efficient training methods such as *prompting*. Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. diff --git a/docs/source/developer_guides/contributing.mdx b/docs/source/developer_guides/contributing.md similarity index 97% rename from docs/source/developer_guides/contributing.mdx rename to docs/source/developer_guides/contributing.md index faacb6bc62..9a3b93af63 100644 --- a/docs/source/developer_guides/contributing.mdx +++ b/docs/source/developer_guides/contributing.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # Contributing to PEFT diff --git a/docs/source/developer_guides/custom_models.mdx b/docs/source/developer_guides/custom_models.md similarity index 97% rename from docs/source/developer_guides/custom_models.mdx rename to docs/source/developer_guides/custom_models.md index 08250aee56..c337885819 100644 --- a/docs/source/developer_guides/custom_models.mdx +++ b/docs/source/developer_guides/custom_models.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. 
+ --> # Working with custom models diff --git a/docs/source/developer_guides/low_level_api.mdx b/docs/source/developer_guides/low_level_api.md similarity index 96% rename from docs/source/developer_guides/low_level_api.mdx rename to docs/source/developer_guides/low_level_api.md index 55e1897887..4d73fa82f0 100644 --- a/docs/source/developer_guides/low_level_api.mdx +++ b/docs/source/developer_guides/low_level_api.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # PEFT as a utility library diff --git a/docs/source/developer_guides/troubleshooting.mdx b/docs/source/developer_guides/troubleshooting.md similarity index 96% rename from docs/source/developer_guides/troubleshooting.mdx rename to docs/source/developer_guides/troubleshooting.md index dfa00a08a6..de4994bc05 100644 --- a/docs/source/developer_guides/troubleshooting.mdx +++ b/docs/source/developer_guides/troubleshooting.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # Troubleshooting diff --git a/docs/source/index.mdx b/docs/source/index.md similarity index 98% rename from docs/source/index.mdx rename to docs/source/index.md index 18c36822b8..5faf706e50 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # PEFT diff --git a/docs/source/install.mdx b/docs/source/install.md similarity index 89% rename from docs/source/install.mdx rename to docs/source/install.md index 35a3c3e6a1..c1f435a5ef 100644 --- a/docs/source/install.mdx +++ b/docs/source/install.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. 
+ --> # Installation diff --git a/docs/source/package_reference/config.mdx b/docs/source/package_reference/config.md similarity index 77% rename from docs/source/package_reference/config.mdx rename to docs/source/package_reference/config.md index 47f5a00707..075ce906f6 100644 --- a/docs/source/package_reference/config.mdx +++ b/docs/source/package_reference/config.md @@ -1,3 +1,7 @@ + + # Configuration The configuration classes stores the configuration of a [`PeftModel`], PEFT adapter models, and the configurations of [`PrefixTuning`], [`PromptTuning`], and [`PromptEncoder`]. They contain methods for saving and loading model configurations from the Hub, specifying the PEFT method to use, type of task to perform, and model configurations like number of layers and number of attention heads. diff --git a/docs/source/package_reference/peft_model.mdx b/docs/source/package_reference/peft_model.md similarity index 86% rename from docs/source/package_reference/peft_model.mdx rename to docs/source/package_reference/peft_model.md index 57dd10e872..a7bbcda9da 100644 --- a/docs/source/package_reference/peft_model.mdx +++ b/docs/source/package_reference/peft_model.md @@ -1,3 +1,7 @@ + + # Models [`PeftModel`] is the base model class for specifying the base Transformer model and configuration to apply a PEFT method to. The base `PeftModel` contains methods for loading and saving models from the Hub, and supports the [`PromptEncoder`] for prompt learning. diff --git a/docs/source/package_reference/tuners.mdx b/docs/source/package_reference/tuners.md similarity index 78% rename from docs/source/package_reference/tuners.mdx rename to docs/source/package_reference/tuners.md index 1f9df3dd7f..a4b7305864 100644 --- a/docs/source/package_reference/tuners.mdx +++ b/docs/source/package_reference/tuners.md @@ -1,3 +1,7 @@ + + # Tuners Each tuner (or PEFT method) has a configuration and model. diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.md similarity index 97% rename from docs/source/quicktour.mdx rename to docs/source/quicktour.md index d9216de40d..346fc2de0d 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # Quicktour diff --git a/docs/source/task_guides/clm-prompt-tuning.mdx b/docs/source/task_guides/clm-prompt-tuning.md similarity index 98% rename from docs/source/task_guides/clm-prompt-tuning.mdx rename to docs/source/task_guides/clm-prompt-tuning.md index 54315c4ce1..835893c139 100644 --- a/docs/source/task_guides/clm-prompt-tuning.mdx +++ b/docs/source/task_guides/clm-prompt-tuning.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # Prompt tuning for causal language modeling diff --git a/docs/source/task_guides/dreambooth_lora.mdx b/docs/source/task_guides/dreambooth_lora.md similarity index 98% rename from docs/source/task_guides/dreambooth_lora.mdx rename to docs/source/task_guides/dreambooth_lora.md index fe734b304b..3798a2e8d8 100644 --- a/docs/source/task_guides/dreambooth_lora.mdx +++ b/docs/source/task_guides/dreambooth_lora.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # DreamBooth fine-tuning with LoRA diff --git a/docs/source/task_guides/image_classification_lora.mdx b/docs/source/task_guides/image_classification_lora.md similarity index 98% rename from docs/source/task_guides/image_classification_lora.mdx rename to docs/source/task_guides/image_classification_lora.md index 1c4ea77ce1..4ff605eff1 100644 --- a/docs/source/task_guides/image_classification_lora.mdx +++ b/docs/source/task_guides/image_classification_lora.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. + --> # Image classification using LoRA diff --git a/docs/source/task_guides/int8-asr.mdx b/docs/source/task_guides/int8-asr.md similarity index 98% rename from docs/source/task_guides/int8-asr.mdx rename to docs/source/task_guides/int8-asr.md index 37d63b6d6d..a4d5318770 100644 --- a/docs/source/task_guides/int8-asr.mdx +++ b/docs/source/task_guides/int8-asr.md @@ -1,3 +1,7 @@ + + # int8 training for automatic speech recognition Quantization reduces the precision of floating point data types, decreasing the memory required to store model weights. However, quantization degrades inference performance because you lose information when you reduce the precision. 8-bit or `int8` quantization uses only a quarter precision, but it does not degrade performance because it doesn't just drop the bits or data. Instead, `int8` quantization *rounds* from one data type to another. 
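The int8 ASR guide renamed just above explains quantized training only in prose at this point in the diff. As a hedged illustration of what that setup usually looks like in code, here is a short sketch; the checkpoint name, target modules, and LoRA hyperparameters are placeholders chosen for the example, not values taken from this patch.

```python
# Illustrative sketch only: 8-bit base model + LoRA adapter for ASR fine-tuning.
# Requires bitsandbytes and a CUDA device; names and hyperparameters are assumptions.
from transformers import AutoModelForSpeechSeq2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v2",  # placeholder checkpoint
    load_in_8bit=True,
    device_map="auto",
)
# Freezes the base weights and prepares the quantized model for stable training.
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],  # attention projections of the base model
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()  # only the LoRA parameters remain trainable
```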
diff --git a/docs/source/task_guides/ptuning-seq-classification.mdx b/docs/source/task_guides/ptuning-seq-classification.md similarity index 98% rename from docs/source/task_guides/ptuning-seq-classification.mdx rename to docs/source/task_guides/ptuning-seq-classification.md index 5fe294e5fa..13fb69f2e4 100644 --- a/docs/source/task_guides/ptuning-seq-classification.mdx +++ b/docs/source/task_guides/ptuning-seq-classification.md @@ -1,3 +1,7 @@ + + # P-tuning for sequence classification It is challenging to finetune large language models for downstream tasks because they have so many parameters. To work around this, you can use *prompts* to steer the model toward a particular downstream task without fully finetuning a model. Typically, these prompts are handcrafted, which may be impractical because you need very large validation sets to find the best prompts. *P-tuning* is a method for automatically searching and optimizing for better prompts in a continuous space. diff --git a/docs/source/task_guides/semantic-similarity-lora.md b/docs/source/task_guides/semantic-similarity-lora.md index bd7c165971..232f939603 100644 --- a/docs/source/task_guides/semantic-similarity-lora.md +++ b/docs/source/task_guides/semantic-similarity-lora.md @@ -1,3 +1,7 @@ + + # LoRA for semantic similarity tasks Low-Rank Adaptation (LoRA) is a reparametrization method that aims to reduce the number of trainable parameters with low-rank representations. The weight matrix is broken down into low-rank matrices that are trained and updated. All the pretrained model parameters remain frozen. After training, the low-rank matrices are added back to the original weights. This makes it more efficient to store and train a LoRA model because there are significantly fewer parameters. diff --git a/docs/source/task_guides/semantic_segmentation_lora.mdx b/docs/source/task_guides/semantic_segmentation_lora.md similarity index 99% rename from docs/source/task_guides/semantic_segmentation_lora.mdx rename to docs/source/task_guides/semantic_segmentation_lora.md index d029cad01f..72f59a579c 100644 --- a/docs/source/task_guides/semantic_segmentation_lora.mdx +++ b/docs/source/task_guides/semantic_segmentation_lora.md @@ -8,6 +8,10 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +rendered properly in your Markdown viewer. 
+ --> # Semantic segmentation using LoRA diff --git a/docs/source/task_guides/seq2seq-prefix-tuning.mdx b/docs/source/task_guides/seq2seq-prefix-tuning.md similarity index 98% rename from docs/source/task_guides/seq2seq-prefix-tuning.mdx rename to docs/source/task_guides/seq2seq-prefix-tuning.md index 993ec74a2a..1eea24bc62 100644 --- a/docs/source/task_guides/seq2seq-prefix-tuning.mdx +++ b/docs/source/task_guides/seq2seq-prefix-tuning.md @@ -1,3 +1,7 @@ + + # Prefix tuning for conditional generation [[open-in-colab]] diff --git a/docs/source/task_guides/token-classification-lora.mdx b/docs/source/task_guides/token-classification-lora.md similarity index 98% rename from docs/source/task_guides/token-classification-lora.mdx rename to docs/source/task_guides/token-classification-lora.md index 05cf461c24..6c2f301940 100644 --- a/docs/source/task_guides/token-classification-lora.mdx +++ b/docs/source/task_guides/token-classification-lora.md @@ -1,3 +1,7 @@ + + # LoRA for token classification Low-Rank Adaptation (LoRA) is a reparametrization method that aims to reduce the number of trainable parameters with low-rank representations. The weight matrix is broken down into low-rank matrices that are trained and updated. All the pretrained model parameters remain frozen. After training, the low-rank matrices are added back to the original weights. This makes it more efficient to store and train a LoRA model because there are significantly fewer parameters. From c0dd27bc974e4a62c6072142146887b75bb2de6c Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 23 Nov 2023 15:40:35 +0100 Subject: [PATCH 52/65] Fix dockerfile build (#1177) * Update Dockerfile * Update build_docker_images.yml * Update Dockerfile * Update build_docker_images.yml --- docker/peft-gpu/Dockerfile | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docker/peft-gpu/Dockerfile b/docker/peft-gpu/Dockerfile index 52af326eed..925ade2e5a 100644 --- a/docker/peft-gpu/Dockerfile +++ b/docker/peft-gpu/Dockerfile @@ -40,6 +40,12 @@ SHELL ["/bin/bash", "-c"] RUN source activate peft && \ python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + # Activate the conda env and install transformers + accelerate from source RUN source activate peft && \ python3 -m pip install -U --no-cache-dir \ @@ -50,13 +56,8 @@ RUN source activate peft && \ git+https://github.com/huggingface/accelerate \ peft[test]@git+https://github.com/huggingface/peft -RUN pip freeze | grep transformers - -# Install apt libs -RUN apt-get update && \ - apt-get install -y curl git wget && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* +RUN source activate peft && \ + pip freeze | grep transformers RUN echo "source activate peft" >> ~/.profile From 19145bba8af953b709fe9ddfff1e0c5000172bec Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 24 Nov 2023 17:03:59 +0100 Subject: [PATCH 53/65] FIX Wrong use of base layer (#1183) This is important if we have nested adapter layers. This was an overlook during the refactoring #1106. 
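For context, a rough sketch of the failure mode this commit fixes (the `ToyAdapterLayer` class below is illustrative only, not PEFT code): when one adapter layer wraps another, `self.base_layer` points at the inner wrapper, which has no `weight` of its own, so unmerging has to resolve the true underlying module via `get_base_layer()`.

```python
# Minimal sketch of why unmerge must go through get_base_layer() when adapters are nested.
from torch import nn

class ToyAdapterLayer(nn.Module):
    def __init__(self, base_layer: nn.Module):
        super().__init__()
        self.base_layer = base_layer  # may itself be another ToyAdapterLayer

    def get_base_layer(self) -> nn.Module:
        # walk down until the real underlying module is reached
        base = self.base_layer
        while hasattr(base, "base_layer"):
            base = base.base_layer
        return base

linear = nn.Linear(4, 4)
nested = ToyAdapterLayer(ToyAdapterLayer(linear))

# self.base_layer is the inner wrapper and has no .weight to update,
# while get_base_layer() resolves to the underlying nn.Linear.
assert not hasattr(nested.base_layer, "weight")
assert nested.get_base_layer() is linear
```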
--- src/peft/tuners/lycoris_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/lycoris_utils.py b/src/peft/tuners/lycoris_utils.py index 5865887506..8f68ce6bd6 100644 --- a/src/peft/tuners/lycoris_utils.py +++ b/src/peft/tuners/lycoris_utils.py @@ -159,7 +159,7 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self._available_adapters: - self.base_layer.weight.data -= self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) def unscale_layer(self, scale=None) -> None: for active_adapter in self.active_adapters: From b4faffea8ae031e5bd69a76b55418b3650c04c80 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 24 Nov 2023 18:40:19 +0100 Subject: [PATCH 54/65] [`Tests`] Migrate to AWS runners (#1185) * migrate single-gpu runners * Update nightly.yml * Update nightly.yml --------- Co-authored-by: Guillaume LEGENDRE --- .github/workflows/nightly.yml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 4183b7bf4e..694c2be357 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -17,7 +17,7 @@ jobs: run_all_tests_single_gpu: strategy: fail-fast: false - runs-on: [self-hosted, docker-gpu, multi-gpu] + runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci] env: CUDA_VISIBLE_DEVICES: "0" TEST_TYPE: "single_gpu" @@ -26,14 +26,12 @@ jobs: options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true defaults: run: - working-directory: peft/ shell: bash steps: - - name: Update clone & pip install + - uses: actions/checkout@v3 + - name: Pip install run: | source activate peft - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} pip install -e . --no-deps pip install pytest-reportlog @@ -61,7 +59,7 @@ jobs: run_all_tests_multi_gpu: strategy: fail-fast: false - runs-on: [self-hosted, docker-gpu, multi-gpu] + runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci] env: CUDA_VISIBLE_DEVICES: "0,1" TEST_TYPE: "multi_gpu" @@ -70,14 +68,12 @@ jobs: options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true defaults: run: - working-directory: peft/ shell: bash steps: - - name: Update clone + - uses: actions/checkout@v3 + - name: Pip install run: | source activate peft - git config --global --add safe.directory '*' - git fetch && git checkout ${{ github.sha }} pip install -e . 
--no-deps pip install pytest-reportlog From e35d46de19ec8b3b20f7d5d2f9d0c8743bce6631 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Mon, 27 Nov 2023 22:29:11 +0100 Subject: [PATCH 55/65] Fix code example in quicktour.md (#1181) --- docs/source/quicktour.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 346fc2de0d..a6678f59a8 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -89,23 +89,24 @@ This only saves the incremental πŸ€— PEFT weights that were trained, meaning it Easily load your model for inference using the [`~transformers.PreTrainedModel.from_pretrained`] function: ```diff - from transformers import AutoModelForSeq2SeqLM + from transformers import AutoModelForCausalLM, AutoTokenizer + from peft import PeftModel, PeftConfig -+ peft_model_id = "smangrul/twitter_complaints_bigscience_T0_3B_LORA_SEQ_2_SEQ_LM" ++ peft_model_id = "merve/Mistral-7B-Instruct-v0.2" + config = PeftConfig.from_pretrained(peft_model_id) - model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path) + model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path) + model = PeftModel.from_pretrained(model, peft_model_id) tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) model = model.to(device) model.eval() - inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label :", return_tensors="pt") + inputs = tokenizer("Tell me the recipe for chocolate chip cookie", return_tensors="pt") with torch.no_grad(): outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=10) print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]) - 'complaint' + 'Tell me the recipe for chocolate chip cookie dough. + 1. Preheat oven' ``` ## Easy loading with Auto classes @@ -146,4 +147,4 @@ peft_model_id = "smangrul/openai-whisper-large-v2-LORA-colab" Now that you've seen how to train a model with one of the πŸ€— PEFT methods, we encourage you to try out some of the other methods like prompt tuning. The steps are very similar to the ones shown in this quickstart; prepare a [`PeftConfig`] for a πŸ€— PEFT method, and use the `get_peft_model` to create a [`PeftModel`] from the configuration and base model. Then you can train it however you like! -Feel free to also take a look at the task guides if you're interested in training a model with a πŸ€— PEFT method for a specific task such as semantic segmentation, multilingual automatic speech recognition, DreamBooth, and token classification. \ No newline at end of file +Feel free to also take a look at the task guides if you're interested in training a model with a πŸ€— PEFT method for a specific task such as semantic segmentation, multilingual automatic speech recognition, DreamBooth, and token classification. 
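As a side note to the "Easy loading with Auto classes" section touched in the quicktour hunk above, the same checkpoint can also be loaded in a single step with PEFT's Auto classes. A sketch, reusing the adapter id from the diff above; whether that adapter repo ships its own tokenizer is not guaranteed, hence the fallback to the base model:

```python
# One-step loading of base model + adapter via the PEFT Auto class.
from peft import AutoPeftModelForCausalLM, PeftConfig
from transformers import AutoTokenizer

peft_model_id = "merve/Mistral-7B-Instruct-v0.2"
model = AutoPeftModelForCausalLM.from_pretrained(peft_model_id)

# the tokenizer usually lives with the base model, not the adapter repo
base_model_name = PeftConfig.from_pretrained(peft_model_id).base_model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
```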
From e586f96740a9936ce46c33f00980c04215fe4239 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 28 Nov 2023 11:04:57 +0100 Subject: [PATCH 56/65] DOC Update a few places in the README (#1152) - fix bits_and_bytes => bitsandbytes - add a few links - add mistral to list of supported models --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d4dfee5c38..445cb26539 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ Try out the πŸ€— Gradio Space which should run seamlessly on a T4 instance: - Here is an example in [trl](https://github.com/lvwerra/trl) library using PEFT+INT8 for tuning policy model: [gpt2-sentiment_peft.py](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt2-sentiment_peft.py) and corresponding [Blog](https://huggingface.co/blog/trl-peft) - Example using PEFT for Instruction finetuning, reward model and policy : [stack_llama](https://github.com/lvwerra/trl/tree/main/examples/research_projects/stack_llama/scripts) and corresponding [Blog](https://huggingface.co/blog/stackllama) -### INT8 training of large models in Colab using PEFT LoRA and bits_and_bytes +### INT8 training of large models in Colab using PEFT LoRA and bitsandbytes - Here is now a demo on how to fine tune [OPT-6.7b](https://huggingface.co/facebook/opt-6.7b) (14GB in fp16) in a Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing) @@ -223,11 +223,13 @@ DeepSpeed version required `v0.8.0`. An example is provided in `~examples/condit ``` ### Example of PEFT model inference using πŸ€— Accelerate's Big Model Inferencing capabilities -An example is provided in `~examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb`. +An example is provided in [this notebook](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb). ## Models support matrix +Find models that are supported out of the box below. Note that PEFT works with almost all models -- if it is not listed, you just need to [do some manual configuration](https://huggingface.co/docs/peft/developer_guides/custom_models). + ### Causal Language Modeling | Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | |--------------| ---- | ---- | ---- | ---- | ---- | @@ -239,6 +241,7 @@ An example is provided in `~examples/causal_language_modeling/peft_lora_clm_acce | GPT-NeoX-20B | βœ… | βœ… | βœ… | βœ… | βœ… | | LLaMA | βœ… | βœ… | βœ… | βœ… | βœ… | | ChatGLM | βœ… | βœ… | βœ… | βœ… | βœ… | +| Mistral | βœ… | | | | | ### Conditional Generation | Model | LoRA | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | @@ -396,6 +399,8 @@ dummy_inputs = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]]) dummy_outputs = model(dummy_inputs) ``` +Learn more about the [low level API in the docs](https://huggingface.co/docs/peft/developer_guides/low_level_api). + ## Contributing If you would like to contribute to PEFT, please check out our [contributing guide](https://huggingface.co/docs/peft/developer_guides/contributing). 
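The README hunk above links to the low-level API next to the `dummy_inputs` snippet; a self-contained sketch of that workflow is shown below. The `DummyModel` definition is reconstructed here as an assumption for self-containment and is not part of this diff.

```python
# Sketch of the low-level API: inject LoRA layers into a plain torch module
# without creating a PeftModel wrapper.
import torch
from torch import nn
from peft import LoraConfig, inject_adapter_in_model

class DummyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(10, 10)
        self.linear = nn.Linear(10, 10)
        self.lm_head = nn.Linear(10, 10)

    def forward(self, input_ids):
        hidden = self.linear(self.embedding(input_ids))
        return self.lm_head(hidden)

lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["linear"], lora_dropout=0.1, bias="none")
model = inject_adapter_in_model(lora_config, DummyModel())

dummy_inputs = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]])
dummy_outputs = model(dummy_inputs)  # the forward pass now goes through the injected LoRA layers
```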
From 64c8d1da8593bf4f9b736d26d254dfc016a522ea Mon Sep 17 00:00:00 2001 From: elyxlz <58893694+elyxlz@users.noreply.github.com> Date: Tue, 28 Nov 2023 13:17:25 +0000 Subject: [PATCH 57/65] FIX Pass HF token when calling PeftModel.from_pretrained (#1076) --- src/peft/peft_model.py | 1 + src/peft/utils/save_and_load.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index e0f0977e28..c5c7825baa 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -308,6 +308,7 @@ def from_pretrained( revision=kwargs.get("revision", None), cache_dir=kwargs.get("cache_dir", None), use_auth_token=kwargs.get("use_auth_token", None), + token=kwargs.get("token", None), ) ].from_pretrained(model_id, **kwargs) elif isinstance(config, PeftConfig): diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 75e934747e..07e653bef1 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -193,11 +193,16 @@ def load_peft_weights(model_id: str, device: Optional[str] = None, **hf_hub_down filename = os.path.join(path, WEIGHTS_NAME) use_safetensors = False else: + token = hf_hub_download_kwargs.get("token", None) + if token is None: + token = hf_hub_download_kwargs.get("use_auth_token", None) + has_remote_safetensors_file = file_exists( repo_id=model_id, filename=SAFETENSORS_WEIGHTS_NAME, revision=hf_hub_download_kwargs.get("revision", None), repo_type=hf_hub_download_kwargs.get("repo_type", None), + token=token, ) use_safetensors = has_remote_safetensors_file From da29ae62d4d637936f906f102fc16530b4a80d95 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Wed, 29 Nov 2023 00:43:06 +1100 Subject: [PATCH 58/65] ENH Add support for phi model architecture (#1186) --- src/peft/utils/other.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 50f22a5523..e811bee5ba 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -507,6 +507,7 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: "codegen": ["qkv_proj"], "mistral": ["q_proj", "v_proj"], "stablelm": ["q_proj", "v_proj"], + "phi": ["Wqkv", "out_proj", "fc1", "fc2"], } TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING = { From 04c411010b14492b5bc92e3f13108d98ee0e26e2 Mon Sep 17 00:00:00 2001 From: Jialong Wu Date: Tue, 28 Nov 2023 23:04:52 +0800 Subject: [PATCH 59/65] Examples: add options to save or push model (#1159) --- ...a_clm_accelerate_big_model_inference.ipynb | 18 +++++++++- ...ft_lora_clm_accelerate_ds_zero3_offload.py | 19 +++++++--- .../peft_prefix_tuning_clm.ipynb | 35 +++++++++++++++++-- .../peft_prompt_tuning_clm.ipynb | 35 +++++++++++++++++-- ...ora_seq2seq_accelerate_ds_zero3_offload.py | 20 ++++++++--- .../peft_lora_seq2seq_accelerate_fsdp.py | 18 +++++++--- 6 files changed, 125 insertions(+), 20 deletions(-) diff --git a/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb b/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb index 5d05e1441a..3e2493a391 100644 --- a/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb +++ b/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb @@ -210,6 +210,23 @@ "print(next(iter(test_dataloader)))" ] }, + { + "cell_type": "markdown", + "id": "42b14a11", + "metadata": {}, + "source": [ + "You can load model from hub or local\n", + "\n", + "- Load model from Hugging Face Hub, you can 
change to your own model id\n", + "```python\n", + "peft_model_id = \"username/twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n", + "```\n", + "- Or load model form local\n", + "```python\n", + "peft_model_id = \"twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -244,7 +261,6 @@ "\n", "max_memory = {0: \"1GIB\", 1: \"1GIB\", 2: \"2GIB\", 3: \"10GIB\", \"cpu\": \"30GB\"}\n", "peft_model_id = \"smangrul/twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n", - "\n", "config = PeftConfig.from_pretrained(peft_model_id)\n", "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, device_map=\"auto\", max_memory=max_memory)\n", "model = PeftModel.from_pretrained(model, peft_model_id, device_map=\"auto\", max_memory=max_memory)" diff --git a/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py b/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py index 224d17d813..2b7fcf23b5 100644 --- a/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py +++ b/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py @@ -349,12 +349,21 @@ def test_preprocess_function(examples): pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False) accelerator.wait_for_everyone() - model.push_to_hub( - "smangrul/" - + f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), - state_dict=accelerator.get_state_dict(model), - use_auth_token=True, + # Option1: Pushing the model to Hugging Face Hub + # model.push_to_hub( + # f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), + # token = "hf_..." + # ) + # token (`bool` or `str`, *optional*): + # `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated + # when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + # is not specified. + # Or you can get your token from https://huggingface.co/settings/token + # Option2: Saving the model locally + peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace( + "/", "_" ) + model.save_pretrained(peft_model_id) accelerator.wait_for_everyone() diff --git a/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb b/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb index ef0109bf4a..607b5291a3 100644 --- a/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb +++ b/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb @@ -1228,6 +1228,33 @@ " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" ] }, + { + "cell_type": "markdown", + "id": "0e21c49b", + "metadata": {}, + "source": [ + "You can push model to hub or save model locally. \n", + "\n", + "- Option1: Pushing the model to Hugging Face Hub\n", + "```python\n", + "model.push_to_hub(\n", + " f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\"),\n", + " token = \"hf_...\"\n", + ")\n", + "```\n", + "token (`bool` or `str`, *optional*):\n", + " `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated\n", + " when running `huggingface-cli login` (stored in `~/.huggingface`). 
Will default to `True` if `repo_url`\n", + " is not specified.\n", + " Or you can get your token from https://huggingface.co/settings/token\n", + "```\n", + "- Or save model locally\n", + "```python\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\")\n", + "model.save_pretrained(peft_model_id)\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 16, @@ -1236,7 +1263,9 @@ "outputs": [], "source": [ "# saving model\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "model.save_pretrained(peft_model_id)" ] }, @@ -1260,7 +1289,9 @@ "source": [ "from peft import PeftModel, PeftConfig\n", "\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "\n", "config = PeftConfig.from_pretrained(peft_model_id)\n", "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)\n", diff --git a/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb b/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb index b0a7e26689..948244b671 100644 --- a/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb +++ b/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb @@ -1072,6 +1072,33 @@ " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" ] }, + { + "cell_type": "markdown", + "id": "c8f35152", + "metadata": {}, + "source": [ + "You can push model to hub or save model locally. \n", + "\n", + "- Option1: Pushing the model to Hugging Face Hub\n", + "```python\n", + "model.push_to_hub(\n", + " f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\"),\n", + " token = \"hf_...\"\n", + ")\n", + "```\n", + "token (`bool` or `str`, *optional*):\n", + " `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated\n", + " when running `huggingface-cli login` (stored in `~/.huggingface`). 
Will default to `True` if `repo_url`\n", + " is not specified.\n", + " Or you can get your token from https://huggingface.co/settings/token\n", + "```\n", + "- Or save model locally\n", + "```python\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\")\n", + "model.save_pretrained(peft_model_id)\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 12, @@ -1080,7 +1107,9 @@ "outputs": [], "source": [ "# saving model\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "model.save_pretrained(peft_model_id)" ] }, @@ -1116,7 +1145,9 @@ "source": [ "from peft import PeftModel, PeftConfig\n", "\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "\n", "config = PeftConfig.from_pretrained(peft_model_id)\n", "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)\n", diff --git a/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py b/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py index 36df125cc5..dc202580b0 100644 --- a/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py +++ b/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py @@ -298,12 +298,22 @@ def collate_fn(examples): pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False) accelerator.wait_for_everyone() - model.push_to_hub( - "smangrul/" - + f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), - state_dict=accelerator.get_state_dict(model), - use_auth_token=True, + # Option1: Pushing the model to Hugging Face Hub + # model.push_to_hub( + # f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), + # token = "hf_..." + # ) + # token (`bool` or `str`, *optional*): + # `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated + # when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + # is not specified. 
+ # Or you can get your token from https://huggingface.co/settings/token + + # Option2: Saving the model locally + peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace( + "/", "_" ) + model.save_pretrained(peft_model_id) accelerator.wait_for_everyone() diff --git a/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py b/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py index c2146a5dce..9c60fd8057 100644 --- a/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py +++ b/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py @@ -125,11 +125,19 @@ def preprocess_function(examples): accelerator.print(f"{eval_preds[:10]=}") accelerator.print(f"{dataset['validation'][label_column][:10]=}") accelerator.wait_for_everyone() - model.push_to_hub( - "smangrul/" + f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), - state_dict=accelerator.get_state_dict(model), - use_auth_token=True, - ) + # Option1: Pushing the model to Hugging Face Hub + # model.push_to_hub( + # f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), + # token = "hf_..." + # ) + # token (`bool` or `str`, *optional*): + # `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated + # when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + # is not specified. + # Or you can get your token from https://huggingface.co/settings/token + # Option2: Saving the model locally + peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_") + model.save_pretrained(peft_model_id) accelerator.wait_for_everyone() From f0fb9516d8ebc4fed1bacaa11b789be023f003f8 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Wed, 29 Nov 2023 12:37:39 +0100 Subject: [PATCH 60/65] ENH: Different initialization methods for LoRA (#1189) This PR adds the possibility to use different initialization methods for LoRA, as is a requirement for a completely backwards compatible adoption of PEFT in diffusers. The default is still the same as always, namely the one from the reference implementation by Microsoft. On top of that, it is now possible to pass `init_lora_weights='gaussian'` to initialize the LoRA weights in the same way as is default for diffusers, namely with a normal distribution which is scaled by 1/r. The init method currently applies to LoRA linear and conv layers, but not embedding layers, which are always initialized from a normal distribution (and are probably irrelevant for diffusers). In the future, similar extensions could be added for other adapter methods. 
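Concretely, the new option described above is used as follows; this is only a usage sketch, and the target module names are placeholders rather than values from this patch.

```python
from peft import LoraConfig

# default: reference initialization from the Microsoft implementation
default_config = LoraConfig(target_modules=["q_proj", "v_proj"])

# new: diffusers-style Gaussian initialization scaled by 1/r
gaussian_config = LoraConfig(target_modules=["q_proj", "v_proj"], init_lora_weights="gaussian")
```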
--- setup.py | 4 +- src/peft/tuners/lora/config.py | 12 +- src/peft/tuners/lora/layer.py | 22 +++- tests/test_initialization.py | 232 +++++++++++++++++++++++++++++++++ 4 files changed, 258 insertions(+), 12 deletions(-) create mode 100644 tests/test_initialization.py diff --git a/setup.py b/setup.py index 8e3d60ec7c..7f5e55524f 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,9 @@ extras["quality"] = ["black ~= 22.0", "ruff>=0.0.241", "urllib3<=2.0.0"] extras["docs_specific"] = ["hf-doc-builder"] extras["dev"] = extras["quality"] + extras["docs_specific"] -extras["test"] = extras["dev"] + ["pytest", "pytest-cov", "pytest-xdist", "parameterized", "datasets", "diffusers<0.21.0"] +extras["test"] = extras["dev"] + [ + "pytest", "pytest-cov", "pytest-xdist", "parameterized", "datasets", "diffusers<0.21.0", "scipy" +] setup( name="peft", diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 2412b61a1a..b1e31d8198 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -13,8 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + from dataclasses import dataclass, field -from typing import List, Optional, Union +from typing import List, Literal, Optional, Union from peft.config import PeftConfig from peft.utils import PeftType @@ -76,12 +78,14 @@ class LoraConfig(PeftConfig): "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." }, ) - init_lora_weights: bool = field( + init_lora_weights: bool | Literal["gaussian"] = field( default=True, metadata={ "help": ( - "Whether to initialize the weights of the Lora layers with their default initialization. Don't change " - "this setting, except if you know exactly what you're doing." + "How to initialize the weights of the LoRA layers. Passing True (default) results in the default " + "initialization from the reference implementation from Microsoft. Passing 'gaussian' results " + "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " + "to False leads to completely random initialization and is discouraged." 
), }, ) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index c263053183..5ea726d2ff 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -84,7 +84,7 @@ def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weig self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False) self.scaling[adapter_name] = lora_alpha / r if init_lora_weights: - self.reset_lora_parameters(adapter_name) + self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(self.get_base_layer(), "weight", None) if weight is not None: @@ -116,7 +116,7 @@ def update_layer_conv2d(self, adapter_name, r, lora_alpha, lora_dropout, init_lo self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) self.scaling[adapter_name] = lora_alpha / r if init_lora_weights: - self.reset_lora_parameters(adapter_name) + self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(base_layer, "weight", None) if weight is not None: @@ -142,8 +142,7 @@ def update_layer_embedding(self, adapter_name, r, lora_alpha, lora_dropout, init self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) self.scaling[adapter_name] = lora_alpha / r - if init_lora_weights: - self.reset_lora_parameters(adapter_name) + self.reset_lora_parameters(adapter_name, init_lora_weights) base_layer = self.get_base_layer() weight = getattr(base_layer, "weight", None) @@ -152,10 +151,19 @@ def update_layer_embedding(self, adapter_name, r, lora_alpha, lora_dropout, init self.to(base_layer.weight.device, dtype=weight.dtype) self.set_adapter(self.active_adapters) - def reset_lora_parameters(self, adapter_name): + def reset_lora_parameters(self, adapter_name, init_lora_weights): + if init_lora_weights is False: + return + if adapter_name in self.lora_A.keys(): - # initialize A the same way as the default for nn.Linear and B to zero - nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + if init_lora_weights is True: + # initialize A the same way as the default for nn.Linear and B to zero + # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 + nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + elif init_lora_weights.lower() == "gaussian": + nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) + else: + raise ValueError(f"Unknown initialization {init_lora_weights=}") nn.init.zeros_(self.lora_B[adapter_name].weight) if adapter_name in self.lora_embedding_A.keys(): # initialize a the same way as the default for nn.linear and b to zero diff --git a/tests/test_initialization.py b/tests/test_initialization.py new file mode 100644 index 0000000000..3770b4a74f --- /dev/null +++ b/tests/test_initialization.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import torch +from scipy import stats +from torch import nn + +from peft import LoraConfig, get_peft_model +from peft.utils import infer_device + + +class InitializationTest(unittest.TestCase): + """Test class to check the initialization of adapters.""" + + torch_device = infer_device() + + def get_uniform(self, amin, amax, size=(10000,)): + unif = torch.distributions.uniform.Uniform(amin, amax) + samples = unif.sample(size) + return samples + + def get_normal(self, mean, std, size=(10000,)): + normal = torch.distributions.normal.Normal(mean, std) + samples = normal.sample(size) + return samples + + def get_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + # choose a large weight so that averages are close to expected values + self.linear = nn.Linear(1000, 1000) + self.embed = nn.Embedding(1000, 1000) + self.conv2d = nn.Conv2d(100, 100, 3) + + def forward(self, x): + return self.linear(x) + + return MyModule().eval().to(self.torch_device) + + def test_lora_linear_init_default(self): + # default is True + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["linear"]) + model = get_peft_model(model, config) + weight_A = model.linear.lora_A["default"].weight + weight_B = model.linear.lora_B["default"].weight + + # use statistical test to check if weight A is from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a normal distribution + normal = self.get_normal(weight_A.mean().item(), weight_A.std().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_linear_init_gaussian(self): + # use gaussian init + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["linear"], init_lora_weights="gaussian") + model = get_peft_model(model, config) + weight_A = model.linear.lora_A["default"].weight + weight_B = model.linear.lora_B["default"].weight + + # use statistical test to check if weight A is from a normal distribution + normal = self.get_normal(0.0, 1 / config.r) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + + # import matplotlib.pyplot as plt + # x = weight_A.detach().flatten().cpu().numpy() + # breakpoint() + + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_linear_false(self): + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["linear"], init_lora_weights=False) + model = get_peft_model(model, config) + weight_B = model.linear.lora_B["default"].weight + + # with init_lora_weights=False, weight B should *not* be zero. We don't care so much about the actual values + # as long as they are not zero, in order to avoid identity transformation. 
+ self.assertFalse(torch.allclose(weight_B, torch.zeros_like(weight_B))) + + def test_lora_embedding_default(self): + # embedding is initialized as a normal distribution, not kaiming uniform + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["embed"]) + model = get_peft_model(model, config) + weight_A = model.embed.lora_embedding_A["default"] + weight_B = model.embed.lora_embedding_B["default"] + + # use statistical test to check if weight B is from a normal distribution + normal = self.get_normal(0.0, 1.0) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight B is *not* from a uniform distribution + unif = self.get_uniform(weight_B.min().item(), weight_B.max().item()) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight A is zero + self.assertTrue((weight_A == 0.0).all()) + + def test_lora_embedding_gaussian(self): + # embedding does not change with init_lora_weights="gaussian" vs True + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["embed"], init_lora_weights="gaussian") + model = get_peft_model(model, config) + weight_A = model.embed.lora_embedding_A["default"] + weight_B = model.embed.lora_embedding_B["default"] + + # use statistical test to check if weight B is from a normal distribution + normal = self.get_normal(0.0, 1.0) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight B is *not* from a uniform distribution + unif = self.get_uniform(weight_B.min().item(), weight_B.max().item()) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight A is zero + self.assertTrue((weight_A == 0.0).all()) + + def test_lora_embedding_false(self): + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["embed"], init_lora_weights=False) + model = get_peft_model(model, config) + weight_A = model.embed.lora_embedding_B["default"] + + # with init_lora_weights=False, weight A should *not* be zero. We don't care so much about the actual values + # as long as they are not zero, in order to avoid identity transformation. 
+ self.assertFalse(torch.allclose(weight_A, torch.zeros_like(weight_A))) + + def test_lora_conv2d_default(self): + # default is True + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["conv2d"]) + model = get_peft_model(model, config) + weight_A = model.conv2d.lora_A["default"].weight + weight_B = model.conv2d.lora_B["default"].weight + + # use statistical test to check if weight A is from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a normal distribution + normal = self.get_normal(weight_A.mean().item(), weight_A.std().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_conv2d_init_gaussian(self): + # use gaussian init + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["conv2d"], init_lora_weights="gaussian") + model = get_peft_model(model, config) + weight_A = model.conv2d.lora_A["default"].weight + weight_B = model.conv2d.lora_B["default"].weight + + # use statistical test to check if weight A is from a normal distribution + normal = self.get_normal(0.0, 1 / config.r) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_conv2d_false(self): + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["conv2d"], init_lora_weights=False) + model = get_peft_model(model, config) + weight_B = model.conv2d.lora_B["default"].weight + + # with init_lora_weights=False, weight B should *not* be zero. We don't care so much about the actual values + # as long as they are not zero, in order to avoid identity transformation. + self.assertFalse(torch.allclose(weight_B, torch.zeros_like(weight_B))) From 8298f1a3668604ac9bc3f6e28b24e8eb554891a1 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:28:41 +0530 Subject: [PATCH 61/65] Training PEFT models with new tokens being added to the embedding layers and tokenizer (#1147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add support for saving base layers weights along with adapter weights * Update save_and_load.py * Add an example showing the usage of the added feature * refactor the functionality * fix * refactoring code 1. Add `is_embedding_layer_resized` parameter to `save_pretrained` 2. Fix the deduplication in README when adding PEFT details. 3. `save_pretrained` should only save the model when `is_main_process=True` which is one of the parameters of `save_pretrained`. 
* update example * fix the model card * fix model card * πŸ˜… * fix model card * automate setting `is_embedding_layer_resized` * nits * Update peft_lora_clm_with_additional_tokens.ipynb * add test * fix tests * maybe fixes the issue? * address comments Co-Authored-By: Benjamin Bossan * Apply suggestions from code review Co-authored-by: Benjamin Bossan --------- Co-authored-by: Benjamin Bossan --- ...peft_lora_clm_with_additional_tokens.ipynb | 1012 +++++++++++++++++ src/peft/peft_model.py | 52 +- src/peft/utils/other.py | 1 + src/peft/utils/save_and_load.py | 45 +- tests/test_custom_models.py | 76 ++ 5 files changed, 1167 insertions(+), 19 deletions(-) create mode 100644 examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb diff --git a/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb b/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb new file mode 100644 index 0000000000..81762de08c --- /dev/null +++ b/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb @@ -0,0 +1,1012 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5f239612-620e-4430-8685-9fdc6b179b41", + "metadata": {}, + "source": [ + "# Training PEFT models with new tokens being added to the embedding layers and tokenizer\n", + "\n", + "In this example, we will learn how to train a LoRA model when adding new tokens to the tokenizer and model. \n", + "This is a common usecase when doing the following:\n", + "1. Instruction finetuning with new tokens beind added such as `<|user|>`, `<|assistant|>`, `<|system|>`, ``, `` to properly format the conversations\n", + "2. Finetuning on a specific language wherein language spoecific tokens are added, e.g., korean tokens being added to vocabulary for finetuning LLM on Korean datasets.\n", + "3. Instruction finetuning to return outputs in certain format to enable agent behaviour new tokens such as `<|FUNCTIONS|>`, `<|BROWSE|>`, `<|TEXT2IMAGE|>`, `<|ASR|>`, `<|TTS|>`, `<|GENERATECODE|>`, `<|RAG|>`.\n", + "\n", + "In such cases, you add the Embedding modules to the LORA `target_modules`. PEFT will take care of saving the embedding layers with the new added tokens along with the adapter weights that were trained on the specific initialization of the embeddings weights of the added tokens." 
+ ] + }, + { + "cell_type": "markdown", + "id": "b27c55e8-edaa-4059-90bc-d6096d596902", + "metadata": {}, + "source": [ + "Let's import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6f864c90", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n", + "os.environ[\"WANDB_PROJECT\"] = \"PeftExamples\"\n", + "import transformers\n", + "from peft import (\n", + " LoraConfig,\n", + " PeftConfig,\n", + " PeftModel,\n", + " get_peft_model,\n", + " prepare_model_for_int8_training,\n", + ")\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " HfArgumentParser,\n", + " TrainingArguments,\n", + " Trainer,\n", + " default_data_collator,\n", + ")\n", + "import torch\n", + "from dataclasses import dataclass, field\n", + "from typing import Optional\n", + "from dataclass_csv import DataclassReader\n", + "from torch.utils.data import Dataset, DataLoader\n", + "\n", + "from enum import Enum" + ] + }, + { + "cell_type": "markdown", + "id": "74950a3f-bb63-4ce5-9e2b-1b83f92b13a2", + "metadata": {}, + "source": [ + "## Prepare Model and Tokenizer" + ] + }, + { + "cell_type": "markdown", + "id": "76763f5e-64b2-409b-8845-ae5589f8a4e0", + "metadata": {}, + "source": [ + "Now, we will be adding 27 new tokens as well as replace the existing pad, bos and eos tokens of the model." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fd0498ea-547e-418d-bf13-c9abafdd5476", + "metadata": {}, + "outputs": [], + "source": [ + "class SpecialTokens(str, Enum):\n", + " begin_target = \"<|begintarget|>\"\n", + " end_target = \"<|endtarget|>\"\n", + " begin_context = \"<|begincontext|>\"\n", + " end_context = \"<|endcontext|>\"\n", + " system = \"<|system|>\"\n", + " user = \"<|user|>\"\n", + " begin_last_user_utterance = \"<|beginlastuserutterance|>\"\n", + " end_last_user_utterance = \"<|endlastuserutterance|>\"\n", + " begin_dsts = \"<|begindsts|>\"\n", + " end_dsts = \"<|enddsts|>\"\n", + " begin_dst = \"<|begindst|>\"\n", + " end_dst = \"<|enddst|>\"\n", + " begin_belief = \"<|beginbelief|>\"\n", + " end_belief = \"<|endbelief|>\"\n", + " begin_response = \"<|beginresponse|>\"\n", + " end_response = \"<|endresponse|>\"\n", + " begin_action = \"<|beginaction|>\"\n", + " end_action = \"<|endaction|>\"\n", + " begin_user_action = \"<|beginuseraction|>\"\n", + " end_user_action = \"<|enduseraction|>\"\n", + " sys_actions = \"<|sysactions|>\"\n", + " begin_intent = \"<|beginintent|>\"\n", + " end_intent = \"<|endintent|>\"\n", + " begin_requested_slots = \"<|beginrequestedslots|>\"\n", + " end_requested_slots = \"<|endrequestedslots|>\"\n", + " pad_token = \"<|pad|>\"\n", + " bos_token = \"<|startoftext|>\"\n", + "\n", + " @classmethod\n", + " def list(cls):\n", + " return [c.value for c in cls]" + ] + }, + { + "cell_type": "markdown", + "id": "ae4a4255-5f13-4eef-a024-4f1de0f2173b", + "metadata": {}, + "source": [ + "We will be finetuning Mistral-7B model. Let's load the tokenizer and add the special tokens followed by loading the base model and resizzing the embedding layers to accomodate the newly added tokens." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f0eedef9", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91c67b6377fc4dd7977bf544de784d51", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|><|begincontext|><|user|> Can you find me place to eat?<|system|> What kind of food would you like to have and where would you like me to search in?<|user|> Food kind of California will be perfect in SF.<|system|> There are 10 restaurants, Al's Place is one of the good restaurant in San Francisco.<|user|> Can you look for any other restaurant?<|system|> Alta Msp is one of the good restaurant in San Francisco.<|beginlastuserutterance|> Can you find me the address?<|endlastuserutterance|><|endcontext|><|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurants<|endintent|><|beginrequestedslots|> Restaurants^street_address<|endrequestedslots|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->California<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^street_address~1275 Minnesota Street<|endaction|><|beginresponse|> The street address of 
the restaurant is 1275 Minnesota Street.<|endresponse|><|endtarget|><|endtarget|>\"" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.decode(train_dataset[0][\"input_ids\"])" + ] + }, + { + "cell_type": "markdown", + "id": "239d1c83-196d-471e-9bf7-5f36dafa9894", + "metadata": {}, + "source": [ + "# Train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ec80d6ee", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n", + "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33msmangrul\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /raid/sourab/temp/wandb/run-20231128_230934-edod21gq" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run ethereal-eon-1 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/smangrul/PeftExamples" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/smangrul/PeftExamples/runs/edod21gq" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [246/246 05:51, Epoch 2/2]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
      Step | Training Loss
        10 | 5.189800
        20 | 3.745500
        30 | 2.371500
        40 | 1.630200
        50 | 1.302600
        60 | 0.999400
        70 | 0.704100
        80 | 0.527800
        90 | 0.509700
       100 | 0.382300
       110 | 0.318200
       120 | 0.323500
       130 | 0.263400
       140 | 0.290900
       150 | 0.277400
       160 | 0.232800
       170 | 0.223600
       180 | 0.229600
       190 | 0.233100
       200 | 0.210200
       210 | 0.245800
       220 | 0.197300
       230 | 0.210100
       240 | 0.209800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=246, training_loss=0.8516577879587809, metrics={'train_runtime': 354.9013, 'train_samples_per_second': 5.556, 'train_steps_per_second': 0.693, 'total_flos': 4.318233532091597e+16, 'train_loss': 0.8516577879587809, 'epoch': 2.0})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_args = TrainingArguments(\n", + " output_dir=\"mistral_lora_clm_with_added_tokens\",\n", + " num_train_epochs=2,\n", + " save_total_limit=5,\n", + " per_device_train_batch_size=8,\n", + " warmup_steps=10,\n", + " weight_decay=0.0001,\n", + " dataloader_drop_last=True,\n", + " bf16=True,\n", + " logging_steps=10,\n", + " learning_rate=1e-5,\n", + " gradient_checkpointing=True,\n", + " gradient_checkpointing_kwargs={\"use_reentrant\": False},\n", + " remove_unused_columns=False,\n", + " hub_model_id=\"smangrul/mistral_lora_clm_with_added_tokens\",\n", + " push_to_hub=True,\n", + " hub_private_repo=True,\n", + ")\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " data_collator=default_data_collator,\n", + ")\n", + "# model.config.use_cache = False\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "7bc1cbed-4eb9-4aaa-ab5f-5b91bf432307", + "metadata": {}, + "source": [ + "# Check the model output on a sample from evaluation dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "71851793", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "context=\"<|begincontext|><|user|>Can you find me a place to eat please?<|system|>Where at? And what kind of cuisine are you craving?<|user|>Somewhere in SF, and I am really craving Thai food at the moment!<|system|>I found a bunch of restaurants, there's actually 10 that you might like in San Francisco, one of them being Baan Thai House & Wine Bar<|user|>How can I reach them? 
And what's their address?<|system|>You can reach them by phone at 415-379-4505 and visit them at 534 Irving Street<|beginlastuserutterance|>Great, that restaurant sounds good<|endlastuserutterance|><|endcontext|>\" \n", + "\n", + " target_predicted='<|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurants<|endintent|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^phone_number~|REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^phone_number~415-379-4505|INFORM->Restaurants^street_address~534 Irving Street<|endaction|><|beginresponse|> Great, the phone number is 415-379-4505 and the address is 534 Irving Street<|endresponse|><|endtarget|>' \n", + "\n", + " target='<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|>Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|>SELECT->Restaurants^~<|enduseraction|><|beginaction|>OFFER_INTENT->Restaurants^intent~ReserveRestaurant<|endaction|><|beginresponse|>Want me to book a table?<|endresponse|><|endtarget|>'\n" + ] + } + ], + "source": [ + "import random\n", + "\n", + "i = random.randint(0, len(dataset[\"test\"]))\n", + "context = dataset[\"test\"][i][\"context\"]\n", + "\n", + "batch = tokenizer(context, return_tensors=\"pt\")\n", + "batch = {k: v.to(\"cuda\") for k, v in batch.items()}\n", + "model.eval()\n", + "output_tokens = model.generate(\n", + " **batch,\n", + " max_new_tokens=256,\n", + " do_sample=True,\n", + " temperature=0.2,\n", + " top_p=0.95,\n", + " top_k=50,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " pad_token_id=tokenizer.pad_token_id,\n", + ")\n", + "target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split(\"<|endcontext|>\")[1]\n", + "target = dataset[\"test\"][i][\"target\"]\n", + "print(f\"{context=} \\n\\n {target_predicted=} \\n\\n {target=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f940a660-2f7c-4a3a-b412-3f037aedb890", + "metadata": {}, + "source": [ + "# Save the Adapter model " + ] + }, + { + "cell_type": "markdown", + "id": "7ebe05e9-9b93-42f6-bba8-46b8cc3d100f", + "metadata": {}, + "source": [ + "When the lora layers are applied to embedding layers, the corresponding base model embedding layers are also saved. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3d7459ba-caa8-4f10-aa70-89be4541cbdf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/raid/sourab/peft/src/peft/utils/save_and_load.py:128: UserWarning: Setting `is_embedding_layer_resized` to `True` as embedding layers found in `target_modules`\n", + " warnings.warn(\"Setting `is_embedding_layer_resized` to `True` as embedding layers found in `target_modules`\")\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8d23186832014f209939ab83e79da011", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 3 LFS files: 0%| | 0/3 [00:00<|user|>Can you find me a place to eat please?<|system|>Where at? 
And what kind of cuisine are you craving?<|user|>Somewhere in SF, and I am really craving Thai food at the moment!<|system|>I found a bunch of restaurants, there's actually 10 that you might like in San Francisco, one of them being Baan Thai House & Wine Bar<|user|>How can I reach them? And what's their address?<|system|>You can reach them by phone at 415-379-4505 and visit them at 534 Irving Street<|beginlastuserutterance|>Great, that restaurant sounds good<|endlastuserutterance|><|endcontext|>\" \n", + "\n", + " target_predicted='<|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurant<|endintent|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^phone_number~|REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^phone_number~415-379-4505|INFORM->Restaurants^street_address~534 Irving Street<|endaction|><|beginresponse|> The phone number is 415-379-4505 and the address is 534 Irving Street<|endresponse|><|endtarget|>' \n", + "\n", + " target='<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|>Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|>SELECT->Restaurants^~<|enduseraction|><|beginaction|>OFFER_INTENT->Restaurants^intent~ReserveRestaurant<|endaction|><|beginresponse|>Want me to book a table?<|endresponse|><|endtarget|>'\n" + ] + } + ], + "source": [ + "from peft import PeftModel\n", + "\n", + "inference_model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " low_cpu_mem_usage=True,\n", + " # use_flash_attention_2=True,\n", + ")\n", + "inference_model.resize_token_embeddings(len(tokenizer))\n", + "\n", + "inference_model = PeftModel.from_pretrained(inference_model, \"smangrul/mistral_lora_clm_with_added_tokens\")\n", + "inference_model.to(\"cuda\")\n", + "inference_model.eval()\n", + "\n", + "output_tokens = inference_model.generate(\n", + " **batch,\n", + " max_new_tokens=256,\n", + " do_sample=True,\n", + " temperature=0.2,\n", + " top_p=0.95,\n", + " top_k=50,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " pad_token_id=tokenizer.pad_token_id,\n", + ")\n", + "\n", + "target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split(\"<|endcontext|>\")[1]\n", + "print(f\"{context=} \\n\\n {target_predicted=} \\n\\n {target=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd57f6e8-761f-4e0b-941c-f6973e13b186", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index c5c7825baa..24ef48c22e 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -159,6 +159,8 @@ def save_pretrained( save_directory: str, safe_serialization: bool = True, selected_adapters: Optional[List[str]] = None, + save_embedding_layers: Union[str, bool] = "auto", + 
is_main_process: bool = True, **kwargs: Any, ): r""" @@ -172,6 +174,14 @@ def save_pretrained( exist). safe_serialization (`bool`, *optional*): Whether to save the adapter files in safetensors format. + selected_adapters (`list(str)`, *optional*): + A list of adapters to be saved. If `None`, will default to all adapters. + save_embedding_layers (`Union[bool, str]`, , *optional*, defaults to `auto`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common + embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. + Based on it sets the boolean flag. This only works for πŸ€— transformers models. + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. Will default to `True`. kwargs (additional keyword arguments, *optional*): Additional keyword arguments passed along to the `push_to_hub` method. """ @@ -190,19 +200,23 @@ def save_pretrained( f" {list(self.peft_config.keys())} - got {selected_adapters}." ) - os.makedirs(save_directory, exist_ok=True) - self.create_or_update_model_card(save_directory) + if is_main_process: + os.makedirs(save_directory, exist_ok=True) + self.create_or_update_model_card(save_directory) for adapter_name in selected_adapters: peft_config = self.peft_config[adapter_name] # save only the trainable weights output_state_dict = get_peft_model_state_dict( - self, state_dict=kwargs.get("state_dict", None), adapter_name=adapter_name + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + save_embedding_layers=save_embedding_layers, ) output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory os.makedirs(output_dir, exist_ok=True) - if safe_serialization: + if is_main_process and safe_serialization: # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 # Safetensors does not allow tensor aliasing. 
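                # (safetensors' `save_file` refuses to serialize two entries that share the same underlying
                # storage; the copied section groups tensors by `id_tensor_storage` and breaks that sharing
                # so that every saved tensor owns its own memory.)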
# We're going to remove aliases before saving @@ -230,7 +244,7 @@ def save_pretrained( os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), metadata={"format": "pt"}, ) - else: + elif is_main_process: torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) # save the config and change the inference mode to `True` @@ -257,7 +271,8 @@ def save_pretrained( else: auto_mapping_dict = None - peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) + if is_main_process: + peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) peft_config.inference_mode = inference_mode @classmethod @@ -721,24 +736,27 @@ def create_or_update_model_card(self, output_dir: str): if hasattr(self.config, "quantization_config"): quantization_config = self.config.quantization_config.to_dict() training_config_text = "" + quantization_prefix = "The following `bitsandbytes` quantization config was used during training:" # Adds quantization information if it was used if quantization_config is not None: - training_config_text += "\nThe following `bitsandbytes` quantization config was used during training:\n" + training_config_text += f"\n{quantization_prefix}\n" training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()]) training_config_text += "\n" - training_procedure_heading = "## Training procedure\n" - if training_procedure_heading in lines: - lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) - else: - lines.append(f"{training_procedure_heading}\n{training_config_text}") + training_procedure_heading = "## Training procedure" + if quantization_prefix not in lines and bool(training_config_text): + if training_procedure_heading in lines: + lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) + else: + lines.append(f"{training_procedure_heading}\n{training_config_text}") # Adds peft version - framework_block_heading = "### Framework versions\n" - if framework_block_heading in lines: - lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}\n") - else: - lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}\n") + framework_block_heading = "### Framework versions" + if f"- PEFT {__version__}" not in lines: + if framework_block_heading in lines: + lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}") + else: + lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}") card.text = "\n".join(lines) card.save(filename) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index e811bee5ba..1c34701739 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -583,3 +583,4 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: WEIGHTS_NAME = "adapter_model.bin" SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" CONFIG_NAME = "adapter_config.json" +EMBEDDING_LAYER_NAMES = ["embed_tokens", "lm_head"] diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 07e653bef1..97bde0d6fe 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
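# A minimal usage sketch for the new `save_pretrained` arguments introduced above; the model id,
# target modules and output path are illustrative placeholders:
#
#     from transformers import AutoModelForCausalLM
#     from peft import LoraConfig, get_peft_model
#
#     base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
#     peft_model = get_peft_model(base, LoraConfig(target_modules=["q_proj", "v_proj", "embed_tokens", "lm_head"]))
#     peft_model.save_pretrained(
#         "mistral_lora_clm_with_added_tokens",
#         save_embedding_layers="auto",  # resolves to True here because "embed_tokens"/"lm_head" are targeted
#         is_main_process=True,          # in multi-process runs, pass e.g. `accelerator.is_main_process`
#     )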
import os +import warnings from typing import Optional import torch @@ -20,11 +21,26 @@ from huggingface_hub.utils import EntryNotFoundError from safetensors.torch import load_file as safe_load_file -from .other import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, infer_device +from .other import EMBEDDING_LAYER_NAMES, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, infer_device from .peft_types import PeftType -def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", unwrap_compiled=False): +def has_valid_embedding_base_layer(layer): + """Check if the layer has an embedding base layer""" + return hasattr(layer, "base_layer") and isinstance(layer.base_layer, (torch.nn.Linear, torch.nn.Embedding)) + + +def get_embedding_layer_name(model, layer, is_prompt_learning): + """Get the name of the embedding module for a given layer.""" + for name, module in model.named_modules(): + if (is_prompt_learning and module == layer) or module == layer.base_layer: + return name + return None + + +def get_peft_model_state_dict( + model, state_dict=None, adapter_name="default", unwrap_compiled=False, save_embedding_layers="auto" +): """ Get the state dict of the Peft model. @@ -37,6 +53,10 @@ def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", un The name of the adapter whose state dict should be returned. unwrap_compiled (`bool`, *optional*, defaults to `False`): Whether to unwrap the model if torch.compile was used. + save_embedding_layers (`Union[bool, str]`, , *optional*, defaults to `auto`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common embedding + layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. Based on it + sets the boolean flag. This only works for πŸ€— transformers models. 
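
    Example (a minimal sketch; `peft_model` stands for any πŸ€— transformers model wrapped with `get_peft_model`):

    ```py
    state_dict = get_peft_model_state_dict(peft_model, save_embedding_layers="auto")
    ```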
""" if unwrap_compiled: model = getattr(model, "_orig_mod", model) @@ -100,6 +120,27 @@ def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", un if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in model.modules_to_save): to_return[key.replace("modules_to_save.", "")] = value + # check the common embedding layers in `target_modules` to reset `save_embedding_layers` if necessary + if ( + save_embedding_layers == "auto" + and hasattr(config, "target_modules") + and any(k in config.target_modules for k in EMBEDDING_LAYER_NAMES) + ): + warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.") + save_embedding_layers = True + elif save_embedding_layers == "auto": + save_embedding_layers = False + + if save_embedding_layers and hasattr(model, "get_input_embeddings"): + for layer in [model.get_input_embeddings(), model.get_output_embeddings()]: + if config.is_prompt_learning or has_valid_embedding_base_layer(layer): + # support from version >= 0.6.2 + embedding_module_name = get_embedding_layer_name(model, layer, config.is_prompt_learning) + if embedding_module_name: + to_return.update({k: v for k, v in state_dict.items() if embedding_module_name in k}) + elif save_embedding_layers: + warnings.warn("Could not identify embedding layer(s) because the model is not a πŸ€— transformers model.") + to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()} return to_return diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 347df218b2..b298388a84 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -333,6 +333,33 @@ def forward(self, X): return X +class ModelEmbWithEmbeddingUtils(nn.Module): + # Adds `get_input_embeddings` and `get_output_embeddings` methods to mimic πŸ€— transformers models + def __init__(self): + super().__init__() + self.embed_tokens = nn.Embedding(100, 5) + self.conv1d = Conv1D(1, 5) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lin0 = nn.Linear(10, 2) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.embed_tokens(X) + X = self.conv1d(X) + X = self.relu(X) + X = self.flat(X) + X = self.lin0(X) + X = self.sm(X) + return X + + def get_input_embeddings(self): + return self.embed_tokens + + def get_output_embeddings(self): + return None + + class ModelConv2D(nn.Module): def __init__(self): super().__init__() @@ -750,6 +777,55 @@ def test_non_existing_model_card(self): # rough check that the model card is pre-filled self.assertGreater(len(model_card), 1000) + @parameterized.expand(["auto", True, False]) + def test_targeting_lora_to_embedding_layer(self, save_embedding_layers): + model = ModelEmbWithEmbeddingUtils() + config = LoraConfig(target_modules=["embed_tokens", "lin0"], init_lora_weights=False) + model = get_peft_model(model, config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + if save_embedding_layers == "auto": + # assert warning + msg_start = "Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`." 
+ with self.assertWarns(UserWarning, msg=msg_start): + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + else: + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + from safetensors.torch import load_file as safe_load_file + + state_dict = safe_load_file(os.path.join(tmp_dirname, "adapter_model.safetensors")) + if save_embedding_layers in ["auto", True]: + self.assertTrue("base_model.model.embed_tokens.base_layer.weight" in state_dict) + self.assertTrue( + torch.allclose( + model.base_model.model.embed_tokens.base_layer.weight, + state_dict["base_model.model.embed_tokens.base_layer.weight"], + ) + ) + else: + self.assertFalse("base_model.model.embed_tokens.base_layer.weight" in state_dict) + del state_dict + + @parameterized.expand(["auto", True, False]) + def test_targeting_lora_to_embedding_layer_non_transformers(self, save_embedding_layers): + model = ModelEmbConv1D() + config = LoraConfig(target_modules=["emb", "lin0"], init_lora_weights=False) + model = get_peft_model(model, config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + if save_embedding_layers is True: + # assert warning + msg_start = "Could not identify embedding layer(s) because the model is not a πŸ€— transformers model." + with self.assertWarns(UserWarning, msg=msg_start): + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + else: + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + from safetensors.torch import load_file as safe_load_file + + state_dict = safe_load_file(os.path.join(tmp_dirname, "adapter_model.safetensors")) + self.assertFalse("base_model.model.emb.base_layer.weight" in state_dict) + del state_dict + @parameterized.expand( [ LoraConfig(target_modules=["lin0"], init_lora_weights=False), From 2b901ee57230559aaf39867c7698f6aca3617162 Mon Sep 17 00:00:00 2001 From: yxli2123 <69247082+yxli2123@users.noreply.github.com> Date: Wed, 29 Nov 2023 11:08:17 -0500 Subject: [PATCH 62/65] Add LoftQ initialization method for LoRA (#1150) --------- Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Co-authored-by: Benjamin Bossan --- README.md | 1 + examples/loftq_finetuning/README.md | 69 ++ .../loftq_finetuning/quantize_save_load.py | 244 +++++ .../loftq_finetuning/train_gsm8k_llama.py | 866 ++++++++++++++++++ requirements.txt | 15 + src/peft/__init__.py | 1 + src/peft/tuners/__init__.py | 2 +- src/peft/tuners/lora/__init__.py | 4 +- src/peft/tuners/lora/config.py | 45 +- src/peft/tuners/lora/layer.py | 48 +- src/peft/tuners/lora/model.py | 4 + src/peft/utils/loftq_utils.py | 227 +++++ 12 files changed, 1514 insertions(+), 12 deletions(-) create mode 100644 examples/loftq_finetuning/README.md create mode 100644 examples/loftq_finetuning/quantize_save_load.py create mode 100644 examples/loftq_finetuning/train_gsm8k_llama.py create mode 100644 requirements.txt create mode 100644 src/peft/utils/loftq_utils.py diff --git a/README.md b/README.md index 445cb26539..79259f98ee 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ Supported methods: 7. MultiTask Prompt Tuning: [Multitask Prompt Tuning Enables Parameter-Efficient Transfer Learning](https://arxiv.org/abs/2303.02861) 8. LoHa: [FedPara: Low-Rank Hadamard Product for Communication-Efficient Federated Learning](https://arxiv.org/abs/2108.06098) 9. 
LoKr: [KronA: Parameter Efficient Tuning with Kronecker Adapter](https://arxiv.org/abs/2212.10650) based on [Navigating Text-To-Image Customization:From LyCORIS Fine-Tuning to Model Evaluation](https://arxiv.org/abs/2309.14859) implementation +10. LoftQ: [LoftQ: LoRA-Fine-Tuning-aware Quantization for Large Language Models](https://arxiv.org/abs/2310.08659) ## Getting started diff --git a/examples/loftq_finetuning/README.md b/examples/loftq_finetuning/README.md new file mode 100644 index 0000000000..726f544e85 --- /dev/null +++ b/examples/loftq_finetuning/README.md @@ -0,0 +1,69 @@ +# LoftQ: LoRA-fine-tuning-aware Quantization + +## Introduction + +LoftQ provides better initialization for LoRA adapters A and B, +and the Quantization of pre-trained weights W. + +## Quantization +We recommend to save the quantized backbone model as fp16/fp32 +and load it as [NormalFloat4](https://arxiv.org/abs/2305.14314). + +We provide a simple example to show how to quantize llama-2-7b model and save/load it. + +```sh +python quantize_save_load.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --token HF_TOKEN \ + --bits 4 --iter 5 --rank 16 \ + --save_dir model_zoo/loftq/ +``` + +- `HF_TOKEN` is the token used to access to [LLAMA models](https://huggingface.co/meta-llama). +- `quantize_and_save()` function will quantize the backbone and initialize LoRA adapters. +It creates 2 folders under `$save_dir`. The quantized backbone is at `Llama-2-7b-hf-4bit-16rank`, +and the LoRA adapters are at the sub-folder `Llama-2-7b-hf-4bit-16rank/loftq_init`. + +## Fine-tuning + +Here is an example to load the quantized backbone and LoRA adapters: + +```python +import os + +from transformers import AutoModelForCausalLM +from peft import PeftModel + + +base_model = AutoModelForCausalLM.from_pretrained( + os.path.join(args.save_dir, "Llama-2-7b-hf-4bit-16rank"), + load_in_4bit=True, +) +peft_model = PeftModel.from_pretrained( + base_model, + os.path.join(args.save_dir, "Llama-2-7b-hf-4bit-16rank", "loftq_init"), + is_trainable=True, +) +``` + +We also provide an example to fine-tune LoftQ on GSM8K. +We load the quantized backbone and LoRA adapters from the [LoftQ Huggingface hub](https://huggingface.co/LoftQ). + +```sh +python train_gsm8k_llama.py \ + --model_name_or_path LoftQ/Llama-2-7b-hf-4bit-64rank \ + --output_dir exp_results/gsm8k/llama-2-7b/bit4-rank64/lr3e-4 \ + --learning_rate 3e-4 \ + --seed 202 \ + --dataset_name gsm8k \ + --dataset_config main \ + --pad_to_max_length \ + --max_source_length 128 \ + --max_target_length 256 \ + --num_train_epochs 5 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --with_tracking \ + --report_to tensorboard +``` diff --git a/examples/loftq_finetuning/quantize_save_load.py b/examples/loftq_finetuning/quantize_save_load.py new file mode 100644 index 0000000000..3c47fa45cd --- /dev/null +++ b/examples/loftq_finetuning/quantize_save_load.py @@ -0,0 +1,244 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
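# A rough sketch of the core LoftQ wiring that `quantize_and_save()` below performs; the model id,
# rank and target modules here are illustrative assumptions:
#
#     from transformers import AutoModelForCausalLM
#     from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model
#
#     model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
#     lora_config = LoraConfig(
#         task_type=TaskType.CAUSAL_LM,
#         r=16,
#         lora_alpha=16,
#         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#         init_lora_weights="loftq",                        # run the alternating LoftQ initialization
#         loftq_config=LoftQConfig(loftq_bits=4, loftq_iter=5),
#     )
#     peft_model = get_peft_model(model, lora_config)       # adapters now carry the LoftQ initialization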
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import torch +import torch.nn as nn +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoTokenizer, + BitsAndBytesConfig, +) + +from peft import LoftQConfig, LoraConfig, PeftModel, TaskType, get_peft_model + + +class Shell(nn.Module): + def __init__(self, weight, bias=None): + super().__init__() + self.weight = nn.Parameter(weight, requires_grad=False) + if bias is not None: + self.bias = nn.Parameter(bias, requires_grad=False) + + +def unwarap_model(model, sub_module_name=".base_layer"): + sub_module_name_list = [k.split(sub_module_name)[0] for k in model.state_dict().keys() if sub_module_name in k] + sub_module_name_set = set(sub_module_name_list) + for name in sub_module_name_set: + # get the parent of the submodule + name_parent = ".".join(name.split(".")[:-1]) + name_child = name.split(".")[-1] + sub_module = model.get_submodule(name_parent) + print(sub_module) + + # replace with shell + child = getattr(sub_module, name_child) + weight = getattr(child.base_layer, "weight", None) + bias = getattr(child.base_layer, "bias", None) + shell = Shell(weight, bias) + + setattr(sub_module, name_child, shell) + + print("You have unwrapped the model. Use it on your own risk.") + + +def print_model(model, name): + print("=" * 10 + name + "=" * 10) + print(model) + for name, param in model.named_parameters(): + if torch.is_tensor(param): + if param.dtype in [torch.float32, torch.float16]: + print( + name, + param.shape, + param.device, + param.dtype, + param.requires_grad, + param.mean().item(), + param.max().item(), + ) + else: + print(name, param.shape, param.device, param.dtype, param.requires_grad) + + +def arg_parse(): + parser = argparse.ArgumentParser(description="Quantize a model with LoftQ.") + parser.add_argument( + "--model_name_or_path", + type=str, + default=None, + required=True, + help="The name or path of the fp32/16 model.", + ) + parser.add_argument( + "--token", + type=str, + default=None, + help="The access token to download model from HuggingFace Hub.", + ) + parser.add_argument( + "--bits", + type=int, + default=4, + help="The quantized bits", + ) + parser.add_argument( + "--iter", + type=int, + default=1, + help="The alternating steps in LoftQ", + ) + parser.add_argument( + "--rank", + type=int, + default=16, + help="The rank of the LoRA adapter", + ) + parser.add_argument( + "--save_dir", + type=str, + default="./model_zoo/loftq/", + help="The rank of the LoRA adapter", + ) + args = parser.parse_args() + return args + + +def quantize_and_save(): + args = arg_parse() + + # Download weights and configure LoRA + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, token=args.token, trust_remote_code=True) + if any(name in args.model_name_or_path.lower() for name in ["llama", "mistral", "falcon"]): + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, token=args.token, trust_remote_code=True, device_map="auto" + ) + task_type = TaskType.CAUSAL_LM + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"] + + elif any(name in args.model_name_or_path.lower() for name in ["bart", "t5"]): + model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path, token=args.token, device_map="auto") + task_type = TaskType.SEQ_2_SEQ_LM + target_modules = ["q_proj", "k_proj", "v_proj", "fc1", "fc2", "out_proj"] + 
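    # (the lists above use the LLaMA/Mistral and BART projection names; other checkpoints routed through
    # these branches may expose different module names, so `target_modules` may need adjusting)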
+ elif any(name in args.model_name_or_path.lower() for name in ["deberta", "roberta", "bert"]): + model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, token=args.token) + model = model.cuda() + task_type = TaskType.SEQ_CLS + target_modules = ["query_proj", "key_proj", "value_proj", "dense"] # embeddings not supported by peft + else: + raise NotImplementedError("Other models not supported yet.") + + # Config of LoftQ + loftq_config = LoftQConfig(loftq_bits=args.bits, loftq_iter=args.iter) + + lora_config = LoraConfig( + task_type=task_type, + inference_mode=True, + r=args.rank, + lora_alpha=16 if task_type is TaskType.CAUSAL_LM else args.rank, + lora_dropout=0.1, + target_modules=target_modules, + init_lora_weights="loftq", + loftq_config=loftq_config, + ) + + # Obtain LoftQ model + lora_model = get_peft_model(model, lora_config) + base_model = lora_model.get_base_model() + + # Save LoftQ model + model_name = args.model_name_or_path.split("/")[-1] + f"-{args.bits}bit" + f"-{args.rank}rank" + base_model_dir = os.path.join(args.save_dir, model_name) + lora_model_dir = os.path.join(args.save_dir, model_name, "loft_init") + + # save lora adapters first + lora_model.base_model.peft_config[ + "default" + ].base_model_name_or_path = base_model_dir # This can be a local path or Hub model id + lora_model.base_model.peft_config["default"].init_lora_weights = True # Don't apply LoftQ when loading again + + lora_model.save_pretrained(lora_model_dir) + print_model(lora_model, "lora_model") + + # remove lora adapters and save the backbone + unwarap_model(base_model) + base_model.save_pretrained(base_model_dir) + tokenizer.save_pretrained(base_model_dir) + + print_model(base_model, "base_model") + + return base_model_dir, lora_model_dir + + +def load_loftq(base_model_path, lora_adapter_path): + if any(name in base_model_path.lower() for name in ["llama", "mistral", "falcon"]): + model = AutoModelForCausalLM.from_pretrained( + base_model_path, + device_map="auto", + low_cpu_mem_usage=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + ), + ) + elif any(name in base_model_path.lower() for name in ["bart", "t5"]): + model = AutoModelForSeq2SeqLM.from_pretrained( + base_model_path, + device_map="auto", + low_cpu_mem_usage=True, + load_in_4bit=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + ), + ) + elif any(name in base_model_path.lower() for name in ["deberta", "roberta", "bert"]): + model = AutoModelForSequenceClassification.from_pretrained( + base_model_path, + low_cpu_mem_usage=True, + load_in_4bit=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + ), + ) + else: + raise NotImplementedError("Other models not supported yet.") + + lora_model = PeftModel.from_pretrained(model, lora_adapter_path, is_trainable=True) + + # Do training or inference below + print_model(lora_model, "lora_model") + print_model(model, "base_model") + + +if __name__ == "__main__": + base_dir, lora_dir = quantize_and_save() + load_loftq(base_dir, lora_dir) + +# example command: +# python quantize_save_load.py \ +# --model_name_or_path meta-llama/Llama-2-7b-hf \ +# --token XXX \ +# --bits 4 --iter 5 --rank 16 \ +# --save_dir ./model_zoo/loftq/ diff --git a/examples/loftq_finetuning/train_gsm8k_llama.py b/examples/loftq_finetuning/train_gsm8k_llama.py new 
file mode 100644 index 0000000000..e8c3580d2e --- /dev/null +++ b/examples/loftq_finetuning/train_gsm8k_llama.py @@ -0,0 +1,866 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import copy +import logging +import math +import os +import random +import re +from pathlib import Path + +import datasets +import torch +import transformers +from accelerate import Accelerator, DistributedType +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from datasets import load_dataset +from huggingface_hub import Repository, create_repo +from torch.utils.data import DataLoader +from tqdm.auto import tqdm +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + SchedulerType, + default_data_collator, + get_scheduler, +) +from transformers.utils import send_example_telemetry +from transformers.utils.versions import require_version + +from peft import PeftModel + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.32.0.dev0") + +logger = get_logger(__name__) + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) +HF_TOKEN = "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" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--train_file", type=str, default=None, help="A csv, txt or a json file containing the training data." + ) + parser.add_argument( + "--validation_file", type=str, default=None, help="A csv, txt or a json file containing the validation data." 
+ ) + parser.add_argument( + "--validation_split_percentage", + default=5, + help="The percentage of the train set used as validation set in case there's no validation split", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=False, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--use_slow_tokenizer", + action="store_true", + help="If passed, will use a slow tokenizer (not backed by the πŸ€— Tokenizers library).", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + parser.add_argument( + "--ignore_pad_token_for_loss", + type=bool, + default=True, + help="Whether to ignore the tokens corresponding to padded labels in the loss computation or not.", + ) + parser.add_argument( + "--max_source_length", + type=int, + default=128, + help=( + "The maximum total input sequence length after " + "tokenization.Sequences longer than this will be truncated, sequences shorter will be padded." + ), + ) + parser.add_argument( + "--max_target_length", + type=int, + default=128, + help=( + "The maximum total sequence length for target text after " + "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded." + "during ``evaluate`` and ``predict``." + ), + ) + parser.add_argument( + "--pad_to_max_length", + action="store_true", + help="If passed, pad all samples to `max_length`. 
Otherwise, dynamic padding is used.", + ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument( + "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files." + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument( + "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." + ) + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--trust_remote_code", + type=bool, + default=False, + help=( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will" + "execute code present on the Hub on your local machine." + ), + ) + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' + "Only applicable when `--with_tracking` is passed." + ), + ) + parser.add_argument( + "--low_cpu_mem_usage", + action="store_true", + help=( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "If passed, LLM loading time and RAM consumption will be benefited." + ), + ) + ########################## + # Generation Config # + ########################## + parser.add_argument( + "--temperature", + type=float, + default=0.8, + help="temperature of 1.0 has no effect, lower tend toward greedy sampling", + ) + parser.add_argument("--k", type=int, default=40, help="Choose k candidate words") + parser.add_argument("--p", type=float, default=0.95, help="The sum of probability of candidate words is 0.9 ") + + ########################## + # Exp Args # + ########################## + parser.add_argument( + "--adapter_name_or_path", + type=str, + default=None, + help=( + "The LoRA adapter checkpoint. Set None if you want to fine-tune from LoftQ." + "Specify a path if you want to evaluate." + ), + ) + + args = parser.parse_args() + + # Sanity checks + if args.dataset_name is None and args.train_file is None and args.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." 
+ if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + + return args + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["project_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + # Retrieve of infer repo_name + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + # Create repo and retrieve repo_id + repo_id = create_repo(repo_name, exist_ok=True, token=args.hub_token).repo_id + # Clone repo locally + repo = Repository(args.output_dir, clone_from=repo_id, token=args.hub_token) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
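        # (with the example command shown earlier, `--dataset_name gsm8k --dataset_config main`, this branch
        # runs; gsm8k only ships train/test splits, so a validation slice is carved out of train below)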
+ raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + ) + else: + data_files = {} + dataset_args = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.train_file.split(".")[-1] + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + if args.config_name: + config = AutoConfig.from_pretrained( + args.config_name, + trust_remote_code=args.trust_remote_code, + ) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained( + args.model_name_or_path, + trust_remote_code=args.trust_remote_code, + ) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, use_fast=not args.use_slow_tokenizer, trust_remote_code=args.trust_remote_code + ) + elif args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, + use_fast=not args.use_slow_tokenizer, + trust_remote_code=args.trust_remote_code, + ) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + ########################## + # Tokenizer # + ########################## + tokenizer.pad_token_id = 0 # unk. 
we want this to be different from the eos token + tokenizer.padding_side = "left" # Allow batched inference + tokenizer.truncation_side = "left" + + if args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + low_cpu_mem_usage=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=config.torch_dtype, + ), + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForCausalLM.from_config(config, trust_remote_code=args.trust_remote_code) + + ########################## + # Peft Model # + ########################## + if args.adapter_name_or_path is None: + model = PeftModel.from_pretrained(model, args.model_name_or_path, subfolder="loftq_init", is_trainable=True) + else: + model = PeftModel.from_pretrained(model, args.adapter_name_or_path, is_trainable=True) + model.print_trainable_parameters() + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. + ########################## + # GSM8K dataset # + ########################## + + # Preprocessing the datasets. + # First we tokenize all the texts. + column_names = raw_datasets["train"].column_names + + # Get the column names for source/target. + source_column, target_column = "question", "answer" + + # Temporarily set max_target_length for training. + padding = "max_length" if args.pad_to_max_length else False + task_prompt = "\nAnswer the above question. First think step by step and then answer the final number.\n" + + def prompt_process(sent_1, sent_2, prompt_1="", prompt_2="", prompt_3=""): + sent_2 = sent_2.replace("####", "The final answer is") + return prompt_1 + sent_1 + prompt_2 + sent_2 + prompt_3 + + def preprocess_function_train(examples): + sources = examples[source_column] + targets = examples[target_column] + + inputs = [prompt_process(source, target, prompt_2=task_prompt) for (source, target) in zip(sources, targets)] + + model_inputs = tokenizer( + inputs, + max_length=args.max_source_length + args.max_target_length, + padding=padding, + truncation=True, + return_tensors="pt", + ) + + labels = copy.deepcopy(model_inputs) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if padding == "max_length" and args.ignore_pad_token_for_loss: + # get the length of the target tokens. 
-1 to kick out the token + target_tokens = tokenizer(targets, padding=False) + target_len = [len(label) - 1 for label in target_tokens["input_ids"]] + + # don't calculate the loss from source and padding (left padding) + for i in range(len(labels["input_ids"])): + labels["input_ids"][i, : -target_len[i]] = -100 + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + def preprocess_function_test(examples): + sources = examples[source_column] + labels = examples[target_column] + + inputs = [source + task_prompt for source in sources] + + model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) + labels = tokenizer(labels, max_length=args.max_target_length, padding=padding, truncation=True) + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + with accelerator.main_process_first(): + train_dataset = raw_datasets["train"].map( + preprocess_function_train, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on training dataset", + ) + + eval_dataset = raw_datasets["test"].map( + preprocess_function_test, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on test dataset", + ) + + # Log a few random samples from the set: + for index in random.sample(range(len(train_dataset)), 2): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + for index in random.sample(range(len(eval_dataset)), 2): + logger.info(f"Sample {index} of the validation set: {eval_dataset[index]}.") + + # DataLoaders creation: + train_dataloader = DataLoader( + train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size + ) + eval_dataloader = DataLoader( + eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size + ) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and "lora" in n], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. 
+ if accelerator.distributed_type == DistributedType.TPU: + model.tie_weights() + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Figure out how many steps we should save the Accelerator states + checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value + accelerator.init_trackers("clm_no_trainer", experiment_config) + + # Train! + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. 
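    # (the bar is sized in optimizer steps: it only advances when `accelerator.sync_gradients` is True,
    # i.e. once per `gradient_accumulation_steps` micro-batches, matching `args.max_train_steps`)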
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + completed_steps = 0 + starting_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + checkpoint_path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + checkpoint_path = path + path = os.path.basename(checkpoint_path) + + accelerator.print(f"Resumed from checkpoint: {checkpoint_path}") + accelerator.load_state(path) + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) + + for epoch in range(starting_epoch, args.num_train_epochs): + model.train() + if args.with_tracking: + total_loss = 0 + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) + accelerator.print(f"Epoch: {epoch} | Step: {step} | Loss: {loss}") + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: + break + + model.eval() + gen_kwargs = { + "max_new_tokens": args.max_target_length, + "temperature": args.temperature, + "top_k": args.k, + "top_p": args.p, + "do_sample": True, + } + ans_pred_list = [] + ans_gold_list = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + gen_kwargs["input_ids"] = batch["input_ids"] + gen_kwargs["attention_mask"] = batch["attention_mask"] + generated_tokens = accelerator.unwrap_model(model).generate(**gen_kwargs) + + pred_tokens = generated_tokens[:, args.max_source_length :] + pred_tokens = accelerator.pad_across_processes(pred_tokens, dim=1, pad_index=tokenizer.pad_token_id) + gold_tokens = batch["labels"] + + if 
not args.pad_to_max_length: + # If we did not pad to max length, we need to pad the labels too + gold_tokens = accelerator.pad_across_processes( + batch["labels"], dim=1, pad_index=tokenizer.pad_token_id + ) + + pred_tokens, gold_tokens = accelerator.gather_for_metrics((pred_tokens, gold_tokens)) + pred_tokens, gold_tokens = pred_tokens.cpu().numpy(), gold_tokens.cpu().numpy() + + if isinstance(pred_tokens, tuple): + pred_tokens = pred_tokens[0] + decoded_pred = tokenizer.batch_decode(pred_tokens, skip_special_tokens=True) + decoded_gold = tokenizer.batch_decode(gold_tokens, skip_special_tokens=True) + + # Extract the numbers in sentences + accelerator.print(decoded_pred) + ans_pred_list += [extract_answer_number(sentence_pred) for sentence_pred in decoded_pred] + ans_gold_list += [extract_answer_number(sentence_gold) for sentence_gold in decoded_gold] + + accelerator.print(ans_pred_list) + accelerator.print(ans_gold_list) + accuracy = compute_accuracy(ans_gold_list, ans_pred_list) + + logger.info(f"epoch {epoch}: accuracy: {accuracy}") + + if args.with_tracking: + accelerator.log( + { + "accuracy": accuracy, + "train_loss": total_loss.item() / len(train_dataloader), + "epoch": epoch, + "step": completed_steps, + }, + step=completed_steps, + ) + + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + repo.push_to_hub( + commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True + ) + + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if args.with_tracking: + accelerator.end_training() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + + +PATTERN_NUMBER = re.compile(r"-?\d+\.?\d*") + + +def extract_answer_number(sentence: str) -> float: + sentence = sentence.replace(",", "") + pred = PATTERN_NUMBER.findall(sentence) + if not pred: + return float("inf") + segment = sentence.split("The final answer is ") + if len(segment) > 1: + pred_answer = segment[1] + pred_answer = PATTERN_NUMBER.findall(pred_answer) + if len(pred_answer) > 0: + pred_answer = pred_answer[0] + else: + pred_answer = float(pred[-1]) + else: + pred_answer = float(pred[-1]) + + if isinstance(pred_answer, str): + try: + pred_answer = float(pred_answer) + except ValueError: + pred_answer = float("inf") + return pred_answer + + +def compute_accuracy(pred: list, gold: list): + acc = 0.0 + for p, g in zip(pred, gold): + if p == g: + acc += 1 + + return acc / len(pred) + + +if __name__ == "__main__": + main() + +# example command + +# python train_gsm8k_llama.py \ +# --model_name_or_path LoftQ/Llama-2-7b-hf-bit4-rank64-backbone \ +# --adapter_name_or_path LoftQ/Llama-2-7b-hf-bit4-rank64-adapters \ +# --output_dir exp_results/gsm8k/llama-2-7b/bit4-rank64/lr3e-4 \ +# 
--learning_rate 1e-4 \ +# --seed 202 \ +# --dataset_name gsm8k \ +# --dataset_config main \ +# --pad_to_max_length \ +# --max_source_length 128 \ +# --max_target_length 256 \ +# --num_train_epochs 5 \ +# --per_device_train_batch_size 4 \ +# --per_device_eval_batch_size 4 \ +# --gradient_accumulation_steps 4 \ +# --with_tracking \ +# --report_to tensorboard diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..dca857de32 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +accelerate +torch +safetensors +bitsandbytes +scipy +peft +transformers +tqdm +packaging +pytest +numpy +pyyaml +datasets +psutil +setuptools \ No newline at end of file diff --git a/src/peft/__init__.py b/src/peft/__init__.py index a3ce332f24..4d9380e697 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -48,6 +48,7 @@ AdaptionPromptConfig, AdaptionPromptModel, LoraConfig, + LoftQConfig, LoraModel, LoHaConfig, LoHaModel, diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index b357d47dc1..666e29d997 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -18,7 +18,7 @@ # limitations under the License. from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel -from .lora import LoraConfig, LoraModel +from .lora import LoraConfig, LoraModel, LoftQConfig from .loha import LoHaConfig, LoHaModel from .lokr import LoKrConfig, LoKrModel from .ia3 import IA3Config, IA3Model diff --git a/src/peft/tuners/lora/__init__.py b/src/peft/tuners/lora/__init__.py index d02bf2d948..ddc81d53cd 100644 --- a/src/peft/tuners/lora/__init__.py +++ b/src/peft/tuners/lora/__init__.py @@ -15,13 +15,13 @@ from peft.import_utils import is_bnb_4bit_available, is_bnb_available -from .config import LoraConfig +from .config import LoftQConfig, LoraConfig from .gptq import QuantLinear from .layer import Conv2d, Embedding, Linear, LoraLayer from .model import LoraModel -__all__ = ["LoraConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"] +__all__ = ["LoraConfig", "LoftQConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"] if is_bnb_available(): diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index b1e31d8198..0dcca5c1e6 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -22,6 +22,25 @@ from peft.utils import PeftType +@dataclass +class LoftQConfig: + """ + This is the sub-configuration class to store the configuration of a [`LoraModel`]. + + Args: + bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the + default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. + bits (`int`): Quantization bits for LoftQ. + iter (`int`): Alternating iterations for LoftQ. + fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear + models. weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 + bits. + """ + + loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) + loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + + @dataclass class LoraConfig(PeftConfig): """ @@ -78,7 +97,7 @@ class LoraConfig(PeftConfig): "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 
}, ) - init_lora_weights: bool | Literal["gaussian"] = field( + init_lora_weights: bool | Literal["gaussian", "loftq"] = field( default=True, metadata={ "help": ( @@ -86,6 +105,7 @@ class LoraConfig(PeftConfig): "initialization from the reference implementation from Microsoft. Passing 'gaussian' results " "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " "to False leads to completely random initialization and is discouraged." + "Pass `'loftq'` to use LoftQ initialization" ), }, ) @@ -121,6 +141,16 @@ class LoraConfig(PeftConfig): ) }, ) + # dict type is used when loading config.json + loftq_config: Union[LoftQConfig, dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone " + "weights and initialize Lora layers." + ) + }, + ) def __post_init__(self): self.peft_type = PeftType.LORA @@ -134,3 +164,16 @@ def __post_init__(self): # if target_modules is a regex expression, then layers_pattern should be None if isinstance(self.target_modules, str) and self.layers_pattern is not None: raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + + # handle init_lora_weights and loftq_config + if self.init_lora_weights == "loftq": + import importlib + + if not importlib.util.find_spec("scipy"): + raise ImportError("The required package 'scipy' is not installed. Please install it to continue.") + if self.loftq_config is None: + raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") + + # convert loftq_config to dict + if self.loftq_config is not None and not isinstance(self.loftq_config, dict): + self.loftq_config = vars(self.loftq_config) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 5ea726d2ff..cf97108c87 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -15,7 +15,7 @@ import math import warnings -from typing import Any, List, Optional +from typing import Any, List, Optional, Union import torch import torch.nn as nn @@ -46,6 +46,7 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None: # Mark the weight as unmerged self._disable_adapters = False self.merged_adapters = [] + self.kwargs = kwargs base_layer = self.get_base_layer() if isinstance(base_layer, nn.Linear): @@ -83,7 +84,10 @@ def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weig self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False) self.scaling[adapter_name] = lora_alpha / r - if init_lora_weights: + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(self.get_base_layer(), "weight", None) @@ -115,7 +119,10 @@ def update_layer_conv2d(self, adapter_name, r, lora_alpha, lora_dropout, init_lo self.lora_A[adapter_name] = nn.Conv2d(self.in_features, r, kernel_size, stride, padding, bias=False) self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) self.scaling[adapter_name] = lora_alpha / r - if init_lora_weights: + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(base_layer, "weight", None) @@ -142,7 +149,11 @@ def update_layer_embedding(self, adapter_name, r, 
lora_alpha, lora_dropout, init self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) self.scaling[adapter_name] = lora_alpha / r - self.reset_lora_parameters(adapter_name, init_lora_weights) + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) base_layer = self.get_base_layer() weight = getattr(base_layer, "weight", None) @@ -170,6 +181,27 @@ def reset_lora_parameters(self, adapter_name, init_lora_weights): nn.init.zeros_(self.lora_embedding_A[adapter_name]) nn.init.normal_(self.lora_embedding_B[adapter_name]) + def loftq_init(self, adapter_name): + from peft.utils.loftq_utils import loftq_init + + weight = self.get_base_layer().weight + kwargs = { + "num_bits": self.kwargs.get("loftq_bits", 4), + "reduced_rank": self.r[adapter_name], + "num_iter": self.kwargs.get("loftq_iter", 1), + } + + qweight, lora_A, lora_B = loftq_init(weight, **kwargs) + if adapter_name in self.lora_A.keys(): + # initialize A the same way as the default for nn.Linear and B to zero + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + self.lora_embedding_A[adapter_name].weight.data = lora_A + self.lora_embedding_B[adapter_name].weight.data = lora_B + self.get_base_layer().weight.data = qweight + def set_scale(self, adapter, scale): if adapter not in self.scaling: # Ignore the case where the adapter is not in the layer @@ -218,11 +250,11 @@ def __init__( lora_dropout: float = 0.0, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) is_target_conv_1d_layer: bool = False, - init_lora_weights: bool = True, + init_lora_weights: Union[bool, str] = True, **kwargs, ) -> None: super().__init__() - LoraLayer.__init__(self, base_layer) + LoraLayer.__init__(self, base_layer, **kwargs) self.fan_in_fan_out = fan_in_fan_out self._active_adapter = adapter_name @@ -351,7 +383,7 @@ def __init__( r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, - init_lora_weights: bool = True, + init_lora_weights: Union[bool, str] = True, **kwargs, ) -> None: super().__init__() @@ -491,7 +523,7 @@ def __init__( r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, - init_lora_weights: bool = True, + init_lora_weights: Union[bool, str] = True, **kwargs, ) -> None: super().__init__() diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 653a684276..6e0a64187a 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -286,8 +286,10 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): elif isinstance(target_base_layer, torch.nn.Embedding): embedding_kwargs = kwargs.copy() embedding_kwargs.pop("fan_in_fan_out", None) + embedding_kwargs.update(lora_config.loftq_config) new_module = Embedding(target, adapter_name, **embedding_kwargs) elif isinstance(target_base_layer, torch.nn.Conv2d): + kwargs.update(lora_config.loftq_config) new_module = Conv2d(target, adapter_name, **kwargs) elif isinstance(target_base_layer, torch.nn.Linear): if kwargs["fan_in_fan_out"]: @@ -296,6 +298,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): "Setting fan_in_fan_out to False." 
) kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + kwargs.update(lora_config.loftq_config) new_module = Linear(target, adapter_name, **kwargs) elif isinstance(target_base_layer, Conv1D): if not kwargs["fan_in_fan_out"]: @@ -304,6 +307,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): "Setting fan_in_fan_out to True." ) kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + kwargs.update(lora_config.loftq_config) new_module = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) else: raise ValueError( diff --git a/src/peft/utils/loftq_utils.py b/src/peft/utils/loftq_utils.py new file mode 100644 index 0000000000..81ff1e2c34 --- /dev/null +++ b/src/peft/utils/loftq_utils.py @@ -0,0 +1,227 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference code: https://github.com/yxli2123/LoftQ/blob/main/utils.py +# Reference paper: https://arxiv.org/abs/2310.08659 + +import logging +from typing import Union + +import torch + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available + + +if is_bnb_available(): + import bitsandbytes as bnb + + +class NFQuantizer: + def __init__(self, num_bits=2, device="cuda", method="normal", block_size=64, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_bits = num_bits + self.device = device + self.method = method + self.block_size = block_size + if self.method == "normal": + self.norm_lookup_table = self.create_normal_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + elif self.method == "uniform": + self.norm_lookup_table = self.create_uniform_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + else: + raise NotImplementedError("Other quantization methods not supported yet.") + + @staticmethod + def create_uniform_map(symmetric=False, num_bits=4): + if symmetric: + # print("symmetric uniform quantization") + negative = torch.linspace(-1, 0, 2 ** (num_bits - 1)) + positive = torch.linspace(0, 1, 2 ** (num_bits - 1)) + table = torch.cat([negative, positive[1:]]) + else: + # print("asymmetric uniform quantization") + table = torch.linspace(-1, 1, 2**num_bits) + return table + + @staticmethod + def create_normal_map(offset=0.9677083, symmetric=False, num_bits=2): + try: + from scipy.stats import norm + except ImportError: + raise ImportError("The required package 'scipy' is not installed. 
Please install it to continue.") + + variations = 2**num_bits + if symmetric: + v = norm.ppf(torch.linspace(1 - offset, offset, variations + 1)).tolist() + values = [] + for index in range(len(v) - 1): + values.append(0.5 * v[index] + 0.5 * v[index + 1]) + v = values + else: + # one more positive value, this is an asymmetric type + v1 = norm.ppf(torch.linspace(offset, 0.5, variations // 2 + 1)[:-1]).tolist() + v2 = [0] + v3 = (-norm.ppf(torch.linspace(offset, 0.5, variations // 2)[:-1])).tolist() + v = v1 + v2 + v3 + + values = torch.Tensor(v) + values = values.sort().values + values /= values.max() + return values + + def quantize_tensor(self, weight): + max_abs = torch.abs(weight).max() + weight_normed = weight / max_abs + + weight_normed_expanded = weight_normed.unsqueeze(-1) + + # Reshape L to have the same number of dimensions as X_expanded + L_reshaped = torch.tensor(self.norm_lookup_table).reshape(1, -1) + + # Calculate the absolute difference between X_expanded and L_reshaped + abs_diff = torch.abs(weight_normed_expanded - L_reshaped) + + # Find the index of the minimum absolute difference for each element + qweight = torch.argmin(abs_diff, dim=-1) + return qweight, max_abs + + def dequantize_tensor(self, qweight, max_abs): + qweight_flatten = qweight.flatten() + + weight_normed = self.norm_lookup_table[qweight_flatten] + weight = weight_normed * max_abs + + weight = weight.reshape(qweight.shape) + + return weight + + def quantize_block(self, weight): + if len(weight.shape) != 2: + raise ValueError(f"Only support 2D matrix, but your input has {len(weight.shape)} dimensions.") + if weight.shape[0] * weight.shape[1] % self.block_size != 0: + raise ValueError( + f"Weight with shape ({weight.shape[0]} x {weight.shape[1]}) " + f"is not dividable by block size {self.block_size}." 
+ ) + + M, N = weight.shape + device = weight.device + + # Quantization + weight_flatten = weight.flatten() # (M*N, ) + weight_block = weight_flatten.reshape(-1, self.block_size) # (L, B), L = M * N / B + if self.method == "normal": + weight_max = weight_block.abs().max(dim=-1)[0] # (L, 1) + elif self.method == "uniform": + weight_max = weight_block.mean(dim=-1) + 2.5 * weight_block.std(dim=-1) + else: + raise NotImplementedError("Method not supported yet.") + weight_max = weight_max.unsqueeze(-1) + weight_divabs = weight_block / weight_max # (L, B) + weight_divabs = weight_divabs.unsqueeze(-1) # (L, B, 1) + L_reshaped = self.norm_lookup_table.reshape(1, -1) # (1, 2**K) + + abs_diff = torch.abs(weight_divabs - L_reshaped) # (L, B, 2**K) + qweight = torch.argmin(abs_diff, dim=-1) # (L, B) + + # Pack multiple k-bit into uint8 + qweight = qweight.reshape(-1, 8 // self.num_bits) + qweight_pack = torch.zeros((M * N // 8 * self.num_bits, 1), dtype=torch.uint8, device=device) + + # data format example: + # [1, 0, 3, 2] or [01, 00, 11, 10] -> [10110001], LIFO + for i in range(8 // self.num_bits): + qweight[:, i] = qweight[:, i] << i * self.num_bits + qweight_pack[:, 0] |= qweight[:, i] + + return qweight_pack, weight_max, weight.shape + + def dequantize_block(self, qweight, weight_max, weight_shape): + # unpack weight + device = qweight.device + weight = torch.zeros((qweight.shape[0], 8 // self.num_bits), dtype=torch.float32, device=device) + for i in range(8 // self.num_bits): + lookup_table_idx = qweight.to(torch.long) % 2**self.num_bits # get the most right 2 bits + lookup_table_idx = lookup_table_idx.to(torch.int) + weight[:, i] = self.norm_lookup_table[lookup_table_idx].squeeze() + qweight = qweight >> self.num_bits # right shift 2 bits of the original data + + weight_block = weight.reshape(-1, self.block_size) + weight = weight_block * weight_max + weight = weight.reshape(weight_shape) + + return weight + + +def _low_rank_decomposition(weight, reduced_rank=32): + """ + :param weight: The matrix to decompose, of shape (H, W) :param reduced_rank: the final rank :return: + """ + matrix_dimension = len(weight.size()) + if matrix_dimension != 2: + raise ValueError(f"Only support 2D matrix, but your input has {matrix_dimension} dimensions.") + + # Use SVD to decompose a matrix, default full_matrices is False to save parameters + U, S, Vh = torch.linalg.svd(weight, full_matrices=False) + + L = U @ (torch.sqrt(torch.diag(S)[:, 0:reduced_rank])) + R = torch.sqrt(torch.diag(S)[0:reduced_rank, :]) @ Vh + + return {"L": L, "R": R, "U": U, "S": S, "Vh": Vh, "reduced_rank": reduced_rank} + + +@torch.no_grad() +def loftq_init(weight: Union[torch.Tensor, torch.nn.Parameter], num_bits: int, reduced_rank: int, num_iter=1): + if num_bits not in [2, 4, 8]: + raise ValueError("Only support 2, 4, 8 bits quantization") + if num_iter <= 0: + raise ValueError("Number of iterations must be greater than 0") + + out_feature, in_feature = weight.size() + device = weight.device + dtype = weight.dtype + + logging.info( + f"Weight: ({out_feature}, {in_feature}) | Rank: {reduced_rank} " + f"| Num Iter: {num_iter} | Num Bits: {num_bits}" + ) + if not is_bnb_4bit_available(): + quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64) + + weight = weight.to(torch.float32) + res = weight.clone() + for i in range(num_iter): + torch.cuda.empty_cache() + # Quantization + if num_bits == 4 and is_bnb_4bit_available(): + qweight = bnb.nn.Params4bit( + res.to("cpu"), requires_grad=False, 
compress_statistics=False, quant_type="nf4" + ).to(device) + dequantized_weight = bnb.functional.dequantize_4bit(qweight.data, qweight.quant_state) + else: + quantized_weight, max_abs, shape = quantizer.quantize_block(res) + dequantized_weight = quantizer.dequantize_block(quantized_weight, max_abs, shape) + + res = weight - dequantized_weight + + # Decompose the residual by SVD + output = _low_rank_decomposition(res, reduced_rank=reduced_rank) + L, R, reduced_rank = output["L"], output["R"], output["reduced_rank"] + res = weight - torch.mm(L, R) + + lora_A, lora_B = R, L + + return dequantized_weight.to(dtype), lora_A, lora_B From 2674f5ea66b43e07f08dabe2634aa9542d979211 Mon Sep 17 00:00:00 2001 From: zhangshengdong29 <435878393@qq.com> Date: Thu, 30 Nov 2023 23:24:58 +0800 Subject: [PATCH 63/65] Megatron distributed parallel linear LoRA (#1092) Adds option to use Megatron's ColumnParallelLinear and RowParallelLinear for LoRA linear layers, leading to improved performance when using LoRA with Megatron. --- src/peft/tuners/lora/config.py | 26 +++++ src/peft/tuners/lora/layer.py | 3 + src/peft/tuners/lora/model.py | 27 +++++ src/peft/tuners/lora/tp_layer.py | 158 +++++++++++++++++++++++++++++ tests/test_lora_megatron.py | 167 +++++++++++++++++++++++++++++++ 5 files changed, 381 insertions(+) create mode 100644 src/peft/tuners/lora/tp_layer.py create mode 100644 tests/test_lora_megatron.py diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 0dcca5c1e6..53269ebb8d 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -141,6 +141,32 @@ class LoraConfig(PeftConfig): ) }, ) + megatron_config: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The TransformerConfig from Megatron, it is used to create LoRA's parallel linear layer." + "You can get it like this, `core_transformer_config_from_args(get_args())`, " + "this two functions are from Megatron." + "You need to specify this parameter when you want to loraize the ColumnParallelLinear and " + "RowParallelLinear layers of megatron." + "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " + "functions, because TransformerConfig may not necessarily be serialized." + "But when using megatron, we can use `get_peft_model_state_dict` function and " + "megatron's framework, they can also save and load models and configurations." + ) + }, + ) + megatron_core: Optional[str] = field( + default="megatron.core", + metadata={ + "help": ( + "The core module from Megatron, it is used to judge and create LoRA's parallel linear layer. " + "It only needs to be passed in when you need to use your own modified megatron core module. " + "Otherwise, it will use the default value `megatron.core`. 
" + ) + }, + ) # dict type is used when loading config.json loftq_config: Union[LoftQConfig, dict] = field( default_factory=dict, diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index cf97108c87..3219ca1e47 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -62,6 +62,9 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None: elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): # QuantLinear in_features, out_features = base_layer.infeatures, base_layer.outfeatures + elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): + # Megatron ColumnParallelLinear,RowParallelLinear + in_features, out_features = base_layer.input_size, base_layer.output_size else: raise ValueError(f"Unsupported layer type {type(base_layer)}") diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 6e0a64187a..4f6538e912 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib import math import operator import re @@ -259,6 +260,10 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): else: target_base_layer = target + megatron_core = None + if lora_config.megatron_config: + megatron_core = importlib.import_module(lora_config.megatron_core) + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): eightbit_kwargs = kwargs.copy() eightbit_kwargs.update( @@ -300,6 +305,28 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False kwargs.update(lora_config.loftq_config) new_module = Linear(target, adapter_name, **kwargs) + elif megatron_core and isinstance( + target_base_layer, + (megatron_core.tensor_parallel.ColumnParallelLinear, megatron_core.tensor_parallel.RowParallelLinear), + ): + from .tp_layer import LoraParallelLinear + + megatron_kwargs = kwargs.copy() + megatron_config = lora_config.megatron_config + if isinstance(megatron_config, dict): + transformer_config_class = megatron_core.transformer.transformer_config.TransformerConfig + megatron_config = transformer_config_class(**lora_config.megatron_config) + megatron_kwargs["megatron_config"] = megatron_config + if megatron_kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `ColumnParallelLinear` " + "or `RowParallelLinear`. " + "Setting fan_in_fan_out to False." 
+ ) + megatron_kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + new_module = LoraParallelLinear( + base_layer=target, adapter_name=adapter_name, backend=megatron_core.tensor_parallel, **megatron_kwargs + ) elif isinstance(target_base_layer, Conv1D): if not kwargs["fan_in_fan_out"]: warnings.warn( diff --git a/src/peft/tuners/lora/tp_layer.py b/src/peft/tuners/lora/tp_layer.py new file mode 100644 index 0000000000..676430cf38 --- /dev/null +++ b/src/peft/tuners/lora/tp_layer.py @@ -0,0 +1,158 @@ +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.init as init + +from .layer import LoraLayer + + +class LoraParallelLinear(nn.Module, LoraLayer): + """ + When the target layer parallel_linear is RowParallelLinear, in order to keep the input and output shapes + consistent, we need to split the lora matrix A into rows, and the lora_B at this time should be a complete linear + layer; In the same way, when the target layer is ColumnParallelLinear, we perform column segmentation on lora_B, + while lora_A is still a complete linear layer. + """ + + def __init__( + self, + base_layer, + adapter_name: str, + backend, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_lora_weights: bool = True, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer=base_layer) + + self.backend = backend + self.is_paralle_a = isinstance(base_layer, backend.RowParallelLinear) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + + megatron_config = kwargs["megatron_config"] + parallel_linear_kwargs = {"megatron_config": megatron_config} + init_method = init.xavier_normal_ + if hasattr(megatron_config, "init_method"): + init_method = megatron_config.init_method + input_is_parallel = True + gather_output = False + if isinstance(base_layer, self.backend.RowParallelLinear): + input_is_parallel = base_layer.input_is_parallel + else: + gather_output = base_layer.gather_output + self.update_layer( + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + init_method, + input_is_parallel, + gather_output, + **parallel_linear_kwargs, + ) + + self.is_target_conv_1d_layer = False + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + init_method=init.xavier_normal_, + input_is_parallel=True, + gather_output=False, + **parallel_linear_kwargs, + ): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + + megatron_config = parallel_linear_kwargs["megatron_config"] + # lora needs to be forced to upgrade to 32-bit precision, otherwise it will overflow + megatron_config.params_dtype = torch.float32 + if self.is_paralle_a: + lora_a = self.backend.RowParallelLinear( + input_size=self.in_features, + output_size=r, + bias=False, + input_is_parallel=input_is_parallel, + skip_bias_add=True, + init_method=init_method, + config=megatron_config, + ) + lora_b = nn.Linear(in_features=r, out_features=self.out_features, bias=False, dtype=torch.float32) + else: + lora_a = nn.Linear(in_features=self.in_features, out_features=r, bias=False, dtype=torch.float32) + lora_b = self.backend.ColumnParallelLinear( + input_size=r, + output_size=self.out_features, + 
bias=False, + gather_output=gather_output, + init_method=init_method, + config=megatron_config, + ) + self.lora_A[adapter_name] = lora_a + self.lora_B[adapter_name] = lora_b + self.scaling[adapter_name] = lora_alpha / r + if init_lora_weights: + self.reset_lora_parameters(adapter_name) + + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any): + previous_dtype = x.dtype + # If weight is used for matrix multiplication here, the final aggregation operation of the original + # parallel_linear layer will be missing, so we need to directly call its forward function to obtain the + # output of the original parallel_linear layer. + if self.disable_adapters: + if self.merged: + self.unmerge() + result, bias = self.base_layer(x, *args, **kwargs) + elif self.merged: + result, bias = self.base_layer(x, *args, **kwargs) + else: + result, bias = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + + lora_result = lora_A(dropout(x)) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_B(lora_result) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_result * scaling + + result = result + lora_result + + result = result.to(previous_dtype) + return result, bias diff --git a/tests/test_lora_megatron.py b/tests/test_lora_megatron.py new file mode 100644 index 0000000000..80d0f43010 --- /dev/null +++ b/tests/test_lora_megatron.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
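+# A minimal usage sketch for the parallel-linear LoRA path added above (it mirrors the
+# test below and assumes Megatron-LM's `megatron.core` is importable and a CUDA device
+# is available):
+#
+#     config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4,
+#                                use_cpu_initialization=True)
+#     lora_config = LoraConfig(
+#         r=64, lora_alpha=16, bias="none",
+#         target_modules=["linear", "lm_head"],
+#         megatron_config=config, megatron_core="megatron.core",
+#     )
+#     model = get_peft_model(DummyModule(config=config).cuda(), lora_config)
+#
+# For a ColumnParallelLinear target, lora_A is a plain nn.Linear and lora_B a
+# ColumnParallelLinear; for a RowParallelLinear target it is the other way around, so
+# the input and output shapes stay consistent with the base layer.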
+import copy +import importlib +import os +import unittest + +import torch +import torch.nn.init as init + +from peft import LoraConfig, PeftModel, get_peft_model, get_peft_model_state_dict + + +def is_megatron_available() -> bool: + return importlib.util.find_spec("megatron") is not None + + +if is_megatron_available(): + from megatron.core import parallel_state, tensor_parallel + from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + from megatron.core.transformer.module import MegatronModule + from megatron.core.transformer.transformer_config import TransformerConfig + + world_size = 1 + rank = 0 + + def initialize_distributed(): + print(f"Initializing torch.distributed with rank: {rank}, world_size: {world_size}") + torch.cuda.set_device(0) + init_method = "tcp://" + master_ip = os.getenv("MASTER_ADDR", "localhost") + master_port = os.getenv("MASTER_PORT", "6001") + init_method += master_ip + ":" + master_port + torch.distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank, init_method=init_method) + + def destroy_model_parallel(): + parallel_state.destroy_model_parallel() + torch.distributed.barrier() + + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + ): + parallel_state.destroy_model_parallel() + if not torch.distributed.is_initialized(): + initialize_distributed() + parallel_state.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + ) + + class DummyModule(MegatronModule): + def __init__(self, config: TransformerConfig): + super().__init__(config) + self.linear = tensor_parallel.ColumnParallelLinear( + input_size=10, + output_size=10, + config=config, + init_method=init.xavier_normal_, + bias=False, + gather_output=False, + ) + self.lm_head = tensor_parallel.RowParallelLinear( + input_size=10, + output_size=10, + config=config, + init_method=init.xavier_normal_, + bias=False, + input_is_parallel=True, + ) + + def forward(self, input): + x = self.linear(input)[0] + x = self.lm_head(x)[0] + return x + + class TestMegatronLora(unittest.TestCase): + def setUp(self): + initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = { + "num_layers": 2, + "hidden_size": 12, + "num_attention_heads": 4, + "use_cpu_initialization": True, + } + config = TransformerConfig(**transformer_config) + self.megatron_module = DummyModule(config=config).cuda() + self.dummy_module = copy.deepcopy(self.megatron_module).cuda() + + lora_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + target_modules=["linear", "lm_head"], + megatron_config=config, + megatron_core="megatron.core", + ) + self.megatron_module = get_peft_model(self.megatron_module, lora_config) + + def tearDown(self): + destroy_model_parallel() + + def test_megatron_lora_module(self): + megatron_module = self.megatron_module + self.assertTrue(isinstance(megatron_module, PeftModel)) + + for name, module in megatron_module.named_modules(): + if name.endswith("linear"): + self.assertTrue(hasattr(module, "lora_A")) + self.assertTrue(hasattr(module, "lora_B")) + if name.endswith("linear.lora_A.default"): + self.assertTrue(isinstance(module, torch.nn.Linear)) + if name.endswith("linear.lora_B.default"): + self.assertTrue(isinstance(module, tensor_parallel.ColumnParallelLinear)) + + if 
name.endswith("lm_head.lora_A.default"): + self.assertTrue(isinstance(module, tensor_parallel.RowParallelLinear)) + if name.endswith("lm_head.lora_B.default"): + self.assertTrue(isinstance(module, torch.nn.Linear)) + + def test_forward(self): + x = torch.ones((2, 4, 10)).cuda() + megatron_module_result = self.megatron_module(x) + dummt_module_result = self.dummy_module(x) + + # Because lora_B is initialized with 0, the forward results of two models should be equal before backward. + self.assertTrue(megatron_module_result.equal(dummt_module_result)) + + def test_backward(self): + optimizer = torch.optim.AdamW(self.megatron_module.parameters()) + loss_fn = torch.nn.CrossEntropyLoss() + + x = torch.randn(2, 4, 10, requires_grad=True).cuda() + label = torch.randint(10, (2 * 4,)).cuda() + + output = self.megatron_module(x) + output = output.reshape(2 * 4, 10) + loss = loss_fn(output, label) + + loss.backward() + optimizer.step() + + def test_get_peft_model_state_dict(self): + peft_state_dict = get_peft_model_state_dict(self.megatron_module) + + for key in peft_state_dict.keys(): + self.assertTrue("lora" in key) From da17ac0f484b28a8471004b47bddfc408969ae04 Mon Sep 17 00:00:00 2001 From: takuoko Date: Fri, 1 Dec 2023 00:58:42 +0900 Subject: [PATCH 64/65] [Feature] Support OFT (#1160) * Support OFT * add test * Update README * fix code quality * fix test * Skip 1 test * fix eps rule and add more test * feat: added examples to new OFT method * fix: removed wrong arguments from model example * fix: changed name of inference file * fix: changed prompt variable * fix docs * fix: dreambooth inference revision based on feedback * fix: review from BenjaminBossan * apply safe merge * del partially * refactor oft * refactor oft * del unused line * del unused line * fix skip in windows * skip test * Add comments about bias added place * rename orig_weights to new_weights * use inverse instead of linalg.inv * delete alpha and scaling --------- Co-authored-by: Lukas Kuhn Co-authored-by: Lukas Kuhn --- README.md | 7 +- .../oft_dreambooth_inference.ipynb | 89 ++ examples/oft_dreambooth/train_dreambooth.py | 1112 +++++++++++++++++ src/peft/__init__.py | 2 + src/peft/mapping.py | 4 + src/peft/peft_model.py | 2 + src/peft/tuners/__init__.py | 1 + src/peft/tuners/oft/__init__.py | 21 + src/peft/tuners/oft/config.py | 109 ++ src/peft/tuners/oft/layer.py | 375 ++++++ src/peft/tuners/oft/model.py | 108 ++ src/peft/utils/peft_types.py | 1 + src/peft/utils/save_and_load.py | 5 +- tests/test_config.py | 4 +- tests/test_custom_models.py | 103 +- tests/test_stablediffusion.py | 23 +- tests/testing_common.py | 6 +- 17 files changed, 1959 insertions(+), 13 deletions(-) create mode 100644 examples/oft_dreambooth/oft_dreambooth_inference.ipynb create mode 100644 examples/oft_dreambooth/train_dreambooth.py create mode 100644 src/peft/tuners/oft/__init__.py create mode 100644 src/peft/tuners/oft/config.py create mode 100644 src/peft/tuners/oft/layer.py create mode 100644 src/peft/tuners/oft/model.py diff --git a/README.md b/README.md index 79259f98ee..09846dc61c 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Supported methods: 8. LoHa: [FedPara: Low-Rank Hadamard Product for Communication-Efficient Federated Learning](https://arxiv.org/abs/2108.06098) 9. LoKr: [KronA: Parameter Efficient Tuning with Kronecker Adapter](https://arxiv.org/abs/2212.10650) based on [Navigating Text-To-Image Customization:From LyCORIS Fine-Tuning to Model Evaluation](https://arxiv.org/abs/2309.14859) implementation 10. 
LoftQ: [LoftQ: LoRA-Fine-Tuning-aware Quantization for Large Language Models](https://arxiv.org/abs/2310.08659) +11. OFT: [Controlling Text-to-Image Diffusion by Orthogonal Finetuning](https://arxiv.org/abs/2306.07280) ## Getting started @@ -278,9 +279,9 @@ Find models that are supported out of the box below. Note that PEFT works with a ### Text-to-Image Generation -| Model | LoRA | LoHa | LoKr | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | -| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| Stable Diffusion | βœ… | βœ… | βœ… | | | | +| Model | LoRA | LoHa | LoKr | OFT | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | +| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Stable Diffusion | βœ… | βœ… | βœ… | βœ… | | | | ### Image Classification diff --git a/examples/oft_dreambooth/oft_dreambooth_inference.ipynb b/examples/oft_dreambooth/oft_dreambooth_inference.ipynb new file mode 100644 index 0000000000..4a28c4040e --- /dev/null +++ b/examples/oft_dreambooth/oft_dreambooth_inference.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "acd7b15e", + "metadata": {}, + "source": [ + "# Dreambooth with OFT\n", + "This Notebook assumes that you already ran the train_dreambooth.py script to create your own adapter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acab479f", + "metadata": {}, + "outputs": [], + "source": [ + "from diffusers import DiffusionPipeline\n", + "from diffusers.utils import check_min_version, get_logger\n", + "from peft import PeftModel\n", + "\n", + "# Will error if the minimal version of diffusers is not installed. Remove at your own risks.\n", + "check_min_version(\"0.10.0.dev0\")\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "BASE_MODEL_NAME = \"stabilityai/stable-diffusion-2-1-base\"\n", + "ADAPTER_MODEL_PATH = \"INSERT MODEL PATH HERE\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe = DiffusionPipeline.from_pretrained(\n", + " BASE_MODEL_NAME,\n", + ")\n", + "pipe.to('cuda')\n", + "pipe.unet = PeftModel.from_pretrained(pipe.unet, ADAPTER_MODEL_PATH + \"/unet\", adapter_name=\"default\")\n", + "pipe.text_encoder = PeftModel.from_pretrained(pipe.text_encoder, ADAPTER_MODEL_PATH + \"/text_encoder\", adapter_name=\"default\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"A photo of a sks dog\"\n", + "image = pipe(\n", + " prompt,\n", + " num_inference_steps=50,\n", + " height=512,\n", + " width=512,\n", + ").images[0]\n", + "image" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/oft_dreambooth/train_dreambooth.py b/examples/oft_dreambooth/train_dreambooth.py new file mode 100644 index 0000000000..cacce70647 --- /dev/null +++ b/examples/oft_dreambooth/train_dreambooth.py @@ -0,0 +1,1112 @@ +import argparse +import gc +import hashlib +import itertools +import logging +import math +import os 
+import threading +import warnings +from contextlib import nullcontext +from pathlib import Path +from typing import Optional + +import datasets +import diffusers +import numpy as np +import psutil +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import HfFolder, Repository, whoami +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +from peft import get_peft_model +from peft.tuners.oft.config import OFTConfig + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.10.0.dev0") + +logger = get_logger(__name__) + +UNET_TARGET_MODULES = ["to_q", "to_v", "query", "value"] # , "ff.net.0.proj"] +TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj"] + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + else: + raise ValueError(f"{model_class} is not supported.") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + required=True, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation 
loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + + # oft args + parser.add_argument("--use_oft", action="store_true", help="Whether to use OFT for parameter efficient tuning") + parser.add_argument("--oft_r", type=int, default=8, help="OFT rank, only used if use_oft is True") + parser.add_argument("--oft_alpha", type=int, default=32, help="OFT alpha, only used if use_oft is True") + parser.add_argument("--oft_dropout", type=float, default=0.0, help="OFT dropout, only used if use_oft is True") + parser.add_argument( + "--oft_use_coft", action="store_true", help="Using constrained OFT, only used if use_oft is True" + ) + parser.add_argument( + "--oft_eps", + type=float, + default=0.0, + help="The control strength of COFT. Only has an effect if `oft_use_coft` is set to True.", + ) + + parser.add_argument( + "--oft_text_encoder_r", + type=int, + default=8, + help="OFT rank for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_alpha", + type=int, + default=32, + help="OFT alpha for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_dropout", + type=float, + default=0.0, + help="OFT dropout for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_use_coft", + action="store_true", + help="Using constrained OFT on the text encoder, only used if use_oft is True", + ) + parser.add_argument( + "--oft_text_encoder_eps", + type=float, + default=0.0, + help="The control strength of COFT on the text encoder. Only has an effect if `oft_text_encoder_use_coft` is set to True.", + ) + + parser.add_argument( + "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader." 
+ ) + + parser.add_argument( + "--no_tracemalloc", + default=False, + action="store_true", + help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.", + ) + + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--wandb_key", + type=str, + default=None, + help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), + ) + parser.add_argument( + "--wandb_project_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + return args + + +# Converting Bytes to Megabytes +def b2mb(x): + return int(x / 2**20) + + +# This context manager is used to track the peak memory usage of the process +class TorchTracemalloc: + def __enter__(self): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero + self.begin = torch.cuda.memory_allocated() + self.process = psutil.Process() + + self.cpu_begin = self.cpu_mem_used() + self.peak_monitoring = True + peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) + peak_monitor_thread.daemon = True + peak_monitor_thread.start() + return self + + def cpu_mem_used(self): + """get resident set size memory for the current process""" + return self.process.memory_info().rss + + def peak_monitor_func(self): + self.cpu_peak = -1 + + while True: + self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak) + + # can't sleep or will not catch the peak right (this comment is here on purpose) + # time.sleep(0.001) # 1msec + + if not self.peak_monitoring: + break + + def __exit__(self, *exc): + self.peak_monitoring = False + + gc.collect() + torch.cuda.empty_cache() + self.end = torch.cuda.memory_allocated() + self.peak = torch.cuda.max_memory_allocated() + self.used = b2mb(self.end - self.begin) + self.peaked = b2mb(self.peak - self.begin) + + self.cpu_end = self.cpu_mem_used() + self.cpu_used = b2mb(self.cpu_end - self.cpu_begin) + self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin) + # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}") + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. 
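+    Each item provides `instance_images` and `instance_prompt_ids`; when a class data root is given (prior
+    preservation), it also provides `class_images` and `class_prompt_ids`.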
+ """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.class_data_root: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_prompt_ids"] = self.tokenizer( + self.class_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + return example + + +def collate_fn(examples, with_prior_preservation=False): + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = torch.cat(input_ids, dim=0) + + batch = { + "input_ids": input_ids, + "pixel_values": pixel_values, + } + return batch + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
+ + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): + if token is None: + token = HfFolder.get_token() + if organization is None: + username = whoami(token)["name"] + return f"{username}/{model_id}" + else: + return f"{organization}/{model_id}" + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_dir=logging_dir, + ) + if args.report_to == "wandb": + import wandb + + wandb.login(key=args.wandb_key) + wandb.init(project=args.wandb_project_name) + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Generate class images if prior preservation is enabled. 
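+    # The prior-preservation loss computed later in the training loop needs up to
+    # --num_class_images generic images of the class; if the class data directory holds
+    # fewer than that, the missing ones are sampled below with the frozen base pipeline.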
+ if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) # noqa: F841 + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDPMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) # DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + if args.use_oft: + config = OFTConfig( + r=args.oft_r, + alpha=args.oft_alpha, + 
target_modules=UNET_TARGET_MODULES, + module_dropout=args.oft_dropout, + init_weights=True, + coft=args.oft_use_coft, + eps=args.oft_eps, + ) + unet = get_peft_model(unet, config) + unet.print_trainable_parameters() + print(unet) + + vae.requires_grad_(False) + if not args.train_text_encoder: + text_encoder.requires_grad_(False) + elif args.train_text_encoder and args.use_oft: + config = OFTConfig( + r=args.oft_text_encoder_r, + alpha=args.oft_text_encoder_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + module_dropout=args.oft_text_encoder_dropout, + init_weights=True, + coft=args.oft_text_encoder_use_coft, + eps=args.oft_text_encoder_eps, + ) + text_encoder = get_peft_model(text_encoder, config) + text_encoder.print_trainable_parameters() + print(text_encoder) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + # below fails when using oft so commenting it out + if args.train_text_encoder and not args.use_oft: + text_encoder.gradient_checkpointing_enable() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + num_workers=args.num_dataloader_workers, + ) + + # Scheduler and math around the number of training steps. 
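+    # If --max_train_steps is not provided, it is derived from --num_train_epochs and the
+    # number of update steps per epoch; it is recomputed further below because
+    # `accelerator.prepare` may change the length of the dataloader.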
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae and text_encoder to device and cast to weight_dtype + vae.to(accelerator.device, dtype=weight_dtype) + if not args.train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("dreambooth", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the mos recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = resume_global_step // num_update_steps_per_epoch + resume_step = resume_global_step % num_update_steps_per_epoch + + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + if args.train_text_encoder: + text_encoder.train() + with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc: + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + continue + + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. 
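+                        # collate_fn placed the instance examples first and the class examples
+                        # second in the batch, so splitting model_pred and target in half along
+                        # dim 0 separates instance predictions from class (prior) predictions.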
+ model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + global_step += 1 + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if ( + args.validation_prompt is not None + and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0 + ): + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." + ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + revision=args.revision, + ) + # set `keep_fp32_wrapper` to True because we do not want to remove + # mixed precision hooks while we are still training + pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True) + pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + if args.seed is not None: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + else: + generator = None + images = [] + for _ in range(args.num_validation_images): + image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + import wandb + + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + if global_step >= args.max_train_steps: + break + # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage + + if not args.no_tracemalloc: + accelerator.print("GPU Memory before entering the train : {}".format(b2mb(tracemalloc.begin))) + accelerator.print("GPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used)) + accelerator.print("GPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked)) + accelerator.print( + "GPU Total Peak Memory consumed during the train (max): {}".format( + tracemalloc.peaked + 
b2mb(tracemalloc.begin) + ) + ) + + accelerator.print("CPU Memory before entering the train : {}".format(b2mb(tracemalloc.cpu_begin))) + accelerator.print( + "CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used) + ) + accelerator.print( + "CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked) + ) + accelerator.print( + "CPU Total Peak Memory consumed during the train (max): {}".format( + tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin) + ) + ) + + # Create the pipeline using using the trained modules and save it. + accelerator.wait_for_everyone() + if accelerator.is_main_process: + if args.use_oft: + unwarpped_unet = accelerator.unwrap_model(unet) + unwarpped_unet.save_pretrained( + os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet) + ) + if args.train_text_encoder: + unwarpped_text_encoder = accelerator.unwrap_model(text_encoder) + unwarpped_text_encoder.save_pretrained( + os.path.join(args.output_dir, "text_encoder"), + state_dict=accelerator.get_state_dict(text_encoder), + ) + else: + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + revision=args.revision, + ) + pipeline.save_pretrained(args.output_dir) + + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 4d9380e697..75ddda498c 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -68,6 +68,8 @@ PromptTuningInit, MultitaskPromptTuningConfig, MultitaskPromptTuningInit, + OFTConfig, + OFTModel, ) from .utils import ( TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, diff --git a/src/peft/mapping.py b/src/peft/mapping.py index f69e89ec3e..60503fa985 100644 --- a/src/peft/mapping.py +++ b/src/peft/mapping.py @@ -42,6 +42,8 @@ LoraConfig, LoraModel, MultitaskPromptTuningConfig, + OFTConfig, + OFTModel, PrefixTuningConfig, PromptEncoderConfig, PromptTuningConfig, @@ -73,6 +75,7 @@ "ADALORA": AdaLoraConfig, "IA3": IA3Config, "MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, + "OFT": OFTConfig, } PEFT_TYPE_TO_TUNER_MAPPING = { @@ -81,6 +84,7 @@ "LOKR": LoKrModel, "ADALORA": AdaLoraModel, "IA3": IA3Model, + "OFT": OFTModel, } diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 24ef48c22e..79bf8e4610 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -44,6 +44,7 @@ LoKrModel, LoraModel, MultitaskPromptEmbedding, + OFTModel, PrefixEncoder, PromptEmbedding, PromptEncoder, @@ -77,6 +78,7 @@ PeftType.ADALORA: AdaLoraModel, PeftType.ADAPTION_PROMPT: AdaptionPromptModel, PeftType.IA3: IA3Model, + PeftType.OFT: OFTModel, } diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 666e29d997..f5f665dd99 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -27,3 +27,4 @@ from .prefix_tuning import PrefixEncoder, PrefixTuningConfig from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .oft import OFTConfig, OFTModel diff --git a/src/peft/tuners/oft/__init__.py b/src/peft/tuners/oft/__init__.py new file mode 100644 index 0000000000..456c46ee07 --- /dev/null 
+++ b/src/peft/tuners/oft/__init__.py @@ -0,0 +1,21 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import OFTConfig +from .layer import Conv2d, Linear, OFTLayer +from .model import OFTModel + + +__all__ = ["OFTConfig", "OFTModel", "Conv2d", "Linear", "OFTLayer"] diff --git a/src/peft/tuners/oft/config.py b/src/peft/tuners/oft/config.py new file mode 100644 index 0000000000..6b43255d1d --- /dev/null +++ b/src/peft/tuners/oft/config.py @@ -0,0 +1,109 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft.tuners.lycoris_utils import LycorisConfig +from peft.utils import PeftType + + +@dataclass +class OFTConfig(LycorisConfig): + """ + This is the configuration class to store the configuration of a [`OFTModel`]. + + Args: + r (`int`): OFT rank. + module_dropout (`int`): The dropout probability for disabling OFT modules during training. + target_modules (`Union[List[str],str]`): The names of the modules to apply OFT to. + init_weights (`bool`): Whether to perform initialization of OFT weights. + layers_to_transform (`Union[List[int],int]`): + The layer indexes to transform, if this argument is specified, it will apply the OFT transformations on the + layer indexes that are specified in this list. If a single integer is passed, it will apply the OFT + transformations on the layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer + pattern is not in the common layers pattern. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + modules_to_save (`List[str]`): The names of modules to be set as trainable except OFT parameters. + coft (`bool`): Whether to use the constrainted variant of OFT or not. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): Whether to share the OFT parameters between blocks or not. 
+ """ + + r: int = field(default=8, metadata={"help": "OFT rank"}) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"} + ) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with OFT." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the OFT layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + coft: bool = field( + default=False, + metadata={"help": "Whether to use the constrainted variant of OFT or not."}, + ) + eps: float = field( + default=6e-5, + metadata={ + "help": "The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True." + }, + ) + block_share: bool = field( + default=False, + metadata={"help": "Whether to share the OFT parameters between blocks or not."}, + ) + + def __post_init__(self): + self.peft_type = PeftType.OFT + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/src/peft/tuners/oft/layer.py b/src/peft/tuners/oft/layer.py new file mode 100644 index 0000000000..b9e0d011b3 --- /dev/null +++ b/src/peft/tuners/oft/layer.py @@ -0,0 +1,375 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
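The configuration above is consumed by the layer implementation that follows. As a quick orientation, here is a minimal sketch of how `OFTConfig` might be applied, assuming a toy two-layer MLP similar to the one used in the test suite (the model itself is only a stand-in for illustration):

```python
import torch.nn as nn

from peft import OFTConfig, get_peft_model


class MLP(nn.Module):
    # Toy stand-in model; any module with Linear/Conv2d submodules can be targeted.
    def __init__(self):
        super().__init__()
        self.lin0 = nn.Linear(10, 20)
        self.lin1 = nn.Linear(20, 2)

    def forward(self, x):
        return self.lin1(self.lin0(x))


config = OFTConfig(r=8, target_modules=["lin0"], module_dropout=0.0, init_weights=True)
peft_model = get_peft_model(MLP(), config)
peft_model.print_trainable_parameters()  # only the oft_r parameters are trainable
```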
+ +import math +import warnings +from typing import Any, List, Optional, Set, Tuple + +import torch +import torch.nn as nn + +from peft.tuners.lycoris_utils import LycorisLayer + + +class OFTLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("oft_r",) + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module): + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # OFT info + self.oft_r = nn.ParameterDict({}) + self.coft = {} + self.eps = {} + self.block_share = {} + + @property + def _available_adapters(self) -> Set[str]: + return {*self.oft_r} + + def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...], block_share: bool): + if block_share: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(1, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + else: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(r, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + + def reset_adapter_parameters(self, adapter_name: str): + nn.init.zeros_(self.oft_r[adapter_name]) + + def reset_adapter_parameters_random(self, adapter_name: str): + nn.init.kaiming_uniform_(self.oft_r[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + module_dropout: float, + init_weights: bool, + coft: bool = False, + eps: float = 6e-5, + block_share: bool = False, + **kwargs, + ) -> None: + """Internal function to create oft adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize weights. + coft (`bool`): Whether to use the constrainted variant of OFT or not. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): Whether to share the OFT parameters between blocks or not. 
+ """ + + self.r[adapter_name] = r + self.module_dropout[adapter_name] = module_dropout + self.coft[adapter_name] = coft + self.block_share[adapter_name] = block_share + + # Determine shape of OFT weights + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + shape = tuple(base_layer.weight.shape) + elif isinstance(base_layer, nn.Conv2d): + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ) + else: + raise TypeError(f"OFT is not implemented for base layers of type {type(base_layer).__name__}") + + self.eps[adapter_name] = eps * math.ceil(shape[0] / r) * math.ceil(shape[0] / r) + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape, block_share) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def unscale_layer(self, scale=None) -> None: + # scale is not used + pass + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + if self.merged: + warnings.warn( + f"Already following adapters were merged {','.join(self.merged_adapters)}. " + f"You are now additionally merging {','.join(self.active_adapters)}." + ) + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + + orig_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = orig_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + orig_weights = torch.transpose(orig_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if orig_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: orig_weights.shape[1], : orig_weights.shape[1]] + new_weights = torch.mm(orig_weights, delta_weight) + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = torch.transpose(new_weights, 0, 1) + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + + if safe_merge and not torch.isfinite(new_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = new_weights + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + new_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + new_weights = torch.transpose(new_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if new_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: new_weights.shape[1], : new_weights.shape[1]] + delta_inv = torch.inverse(delta_weight) + orig_weights = torch.mm(new_weights, delta_inv) + + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = torch.transpose(orig_weights, 0, 1) + orig_weights = orig_weights.reshape( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + base_layer.weight.data = orig_weights + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + rank = self.r[adapter_name] + coft = self.coft[adapter_name] + eps = self.eps[adapter_name] + opt_r = self.oft_r[adapter_name] + + if coft: + with torch.no_grad(): + opt_r.copy_(self._project_batch(opt_r, eps=eps)) + + orth_rotate = self._cayley_batch(opt_r) + weight = self._block_diagonal(orth_rotate, rank) + + return weight + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L144 + def _cayley_batch(self, data: torch.Tensor) -> torch.Tensor: + b, r, c = data.shape + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.transpose(1, 2)) + I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) + + # Perform the Cayley parametrization + Q = torch.bmm(I - skew, torch.inverse(I + skew)) + + return Q + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155 + def _block_diagonal(self, oft_r: torch.Tensor, rank: int) -> torch.Tensor: + if oft_r.shape[0] == 1: + # block share + blocks = [oft_r[0, ...] for i in range(rank)] + else: + blocks = [oft_r[i, ...] 
for i in range(rank)] + + # Use torch.block_diag to create the block diagonal matrix + A = torch.block_diag(*blocks) + + return A + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52 + def _project_batch(self, oft_r, eps=1e-5): + # scaling factor for each of the smaller block matrix + eps = eps * 1 / torch.sqrt(torch.tensor(oft_r.shape[0])) + I = ( + torch.zeros((oft_r.size(1), oft_r.size(1)), device=oft_r.device, dtype=oft_r.dtype) + .unsqueeze(0) + .expand_as(oft_r) + ) + diff = oft_r - I + norm_diff = torch.norm(oft_r - I, dim=(1, 2), keepdim=True) + mask = (norm_diff <= eps).bool() + out = torch.where(mask, oft_r, I + eps * (diff / norm_diff)) + return out + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + if len(result.shape) == 4: + result = result.permute(0, 2, 3, 1) + + base_layer = self.get_base_layer() + base_bias = base_layer.bias + if base_bias is not None: + # Bias should be added after OFT forward + result = result - base_bias.data + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = self._get_delta_activations(active_adapter, result, *args, **kwargs) + + if base_bias is not None: + result = result + base_bias.data + if len(result.shape) == 4: + result = result.permute(0, 3, 1, 2) + + result = result.to(previous_dtype) + return result + + +class Linear(OFTLayer): + """OFT implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." 
+ rep + + +class Conv2d(OFTLayer): + """OFT implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." + rep diff --git a/src/peft/tuners/oft/model.py b/src/peft/tuners/oft/model.py new file mode 100644 index 0000000000..4b7953daa9 --- /dev/null +++ b/src/peft/tuners/oft/model.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, OFTLayer + + +class OFTModel(LycorisTuner): + """ + Creates Orthogonal Finetuning model from a pretrained model. The method is described in + https://arxiv.org/abs/2306.07280 + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`OFTConfig`]): The configuration of the OFT model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The OFT model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import OFTModel, OFTConfig + + >>> config_te = OFTConfig( + ... r=8, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = OFTConfig( + ... r=8, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = OFTModel(model.text_encoder, config_te, "default") + >>> model.unet = OFTModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`OFTConfig`]): The configuration of the OFT model. 
+ """ + + prefix: str = "oft_" + layers_mapping: Dict[Type[torch.nn.Module], Type[OFTLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[OFTLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + **optional_kwargs, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(config.rank_pattern.keys()) + target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + + if isinstance(target, OFTLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 29c764a08f..93b892d9e5 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -30,6 +30,7 @@ class PeftType(str, enum.Enum): IA3 = "IA3" LOHA = "LOHA" LOKR = "LOKR" + OFT = "OFT" class TaskType(str, enum.Enum): diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 97bde0d6fe..c5da274085 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -113,6 +113,8 @@ def get_peft_model_state_dict( to_return["prompt_embeddings"] = prompt_embeddings elif config.peft_type == PeftType.IA3: to_return = {k: state_dict[k] for k in state_dict if "ia3_" in k} + elif config.peft_type == PeftType.OFT: + to_return = {k: state_dict[k] for k in state_dict if "oft_" in k} else: raise NotImplementedError if getattr(model, "modules_to_save", None) is not None: @@ -166,7 +168,7 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul else: state_dict = peft_model_state_dict - if config.peft_type in (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.IA3): + if config.peft_type in (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.IA3, PeftType.OFT): peft_model_state_dict = {} parameter_prefix = { PeftType.IA3: "ia3_", @@ -174,6 +176,7 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul PeftType.ADALORA: "lora_", PeftType.LOHA: "hada_", PeftType.LOKR: "lokr_", + PeftType.OFT: "oft_", }[config.peft_type] for k, v in state_dict.items(): if parameter_prefix in k: diff --git a/tests/test_config.py b/tests/test_config.py index 34f04232a9..06e72dae8e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -30,6 +30,7 @@ LoHaConfig, LoraConfig, MultitaskPromptTuningConfig, + OFTConfig, PeftConfig, PrefixTuningConfig, PromptEncoder, @@ -51,6 +52,7 @@ PrefixTuningConfig, PromptEncoderConfig, PromptTuningConfig, + OFTConfig, ) @@ -189,7 +191,7 @@ def test_prompt_encoder_warning_num_layers(self): expected_msg = "for MLP, the argument `encoder_num_layers` is ignored. Exactly 2 MLP layers are used." 
assert str(record.list[0].message) == expected_msg - @parameterized.expand([LoHaConfig, LoraConfig, IA3Config]) + @parameterized.expand([LoHaConfig, LoraConfig, IA3Config, OFTConfig]) def test_save_pretrained_with_target_modules(self, config_class): # See #1041, #1045 config = config_class(target_modules=["a", "list"]) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index b298388a84..4785526b26 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -24,7 +24,7 @@ from torch import nn from transformers.pytorch_utils import Conv1D -from peft import AdaLoraConfig, IA3Config, LoHaConfig, LoKrConfig, LoraConfig, PeftModel, get_peft_model +from peft import AdaLoraConfig, IA3Config, LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, PeftModel, get_peft_model from peft.tuners.tuners_utils import BaseTunerLayer from .testing_common import PeftCommonTester @@ -191,6 +191,28 @@ "decompose_factor": 4, }, ), + ######## + # OFT # + ######## + ("Vanilla MLP 1 OFT", "MLP", OFTConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 5 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ( + "Vanilla MLP 6 OFT", + "MLP", + OFTConfig, + { + "target_modules": ["lin0"], + "module_dropout": 0.1, + }, + ), + ("Vanilla MLP 7 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "coft": True}), + ("Vanilla MLP 8 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "block_share": True}), + ("Vanilla MLP 9 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "coft": True, "block_share": True}), + ("Conv2d 1 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"]}), + ("Conv2d 3 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "coft": True}), + ("Conv2d 4 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "block_share": True}), + ("Conv2d 5 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "coft": True, "block_share": True}), ] MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES = [ @@ -258,6 +280,7 @@ LoraConfig: "lora_", LoHaConfig: "hada_", LoKrConfig: "lokr_", + OFTConfig: "oft_", } @@ -833,6 +856,7 @@ def test_targeting_lora_to_embedding_layer_non_transformers(self, save_embedding LoHaConfig(target_modules=["lin0"], init_weights=False), AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False), IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False), + OFTConfig(target_modules=["lin0"], init_weights=False), ] ) def test_adapter_name_makes_no_difference(self, config0): @@ -1852,3 +1876,80 @@ def test_requires_grad_lokr_same_targets(self): "base_model.model.lin0.lokr_w1.adapter1", "base_model.model.lin0.lokr_w2.adapter1", ) + + def test_requires_grad_oft_different_targets(self): + # test two different OFT adapters that target different modules + config0 = OFTConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = OFTConfig(target_modules=["lin1"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active pter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # change activate pter to pter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.oft_r.adapter1", + ) + + # disable all pters 
+ with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.oft_r.adapter1", + ) + + def test_requires_grad_oft_same_targets(self): + # same as previous test, except that OFT adapters target the same layer + config0 = OFTConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = OFTConfig(target_modules=["lin0"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.adapter1", + ) diff --git a/tests/test_stablediffusion.py b/tests/test_stablediffusion.py index 830614a7ab..660c17caea 100644 --- a/tests/test_stablediffusion.py +++ b/tests/test_stablediffusion.py @@ -20,7 +20,7 @@ from diffusers import StableDiffusionPipeline from parameterized import parameterized -from peft import LoHaConfig, LoraConfig, get_peft_model +from peft import LoHaConfig, LoraConfig, OFTConfig, get_peft_model from .testing_common import ClassInstantier, PeftCommonTester from .testing_utils import temp_seed @@ -60,11 +60,24 @@ "module_dropout": 0.0, }, }, + { + "text_encoder": { + "r": 8, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "module_dropout": 0.0, + }, + "unet": { + "r": 8, + "target_modules": ["proj_in", "proj_out", "to_k", "to_q", "to_v", "to_out.0", "ff.net.0.proj", "ff.net.2"], + "module_dropout": 0.0, + }, + }, ) CLASSES_MAPPING = { "lora": (LoraConfig, CONFIG_TESTING_KWARGS[0]), "loha": (LoHaConfig, CONFIG_TESTING_KWARGS[1]), "lokr": (LoHaConfig, CONFIG_TESTING_KWARGS[1]), + "oft": (OFTConfig, CONFIG_TESTING_KWARGS[2]), } @@ -115,13 +128,14 @@ def prepare_inputs_for_testing(self): "model_ids": PEFT_DIFFUSERS_SD_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, "loha_kwargs": {"init_weights": [False]}, + "oft_kwargs": {"init_weights": [False]}, }, ) ) def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): - if config_cls == LoHaConfig: + if config_cls in [LoHaConfig, OFTConfig]: # TODO: This test is flaky with PyTorch 2.1 on Windows, we need to figure out what is going on - self.skipTest("LoHaConfig test is flaky") + self.skipTest("LoHaConfig and OFTConfig test is flaky") # Instantiate model & adapters model = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) @@ -148,7 +162,7 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): "model_ids": PEFT_DIFFUSERS_SD_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, }, - filter_params_func=lambda tests: [x for x in tests if all(s not in x[0] for s in ["loha", "lokr"])], + filter_params_func=lambda tests: [x for x in tests if all(s not in x[0] for s in ["loha", "lokr", "oft"])], ) ) def 
test_add_weighted_adapter_base_unchanged(self, test_name, model_id, config_cls, config_kwargs): @@ -178,6 +192,7 @@ def test_add_weighted_adapter_base_unchanged(self, test_name, model_id, config_c "lora_kwargs": {"init_lora_weights": [False]}, "loha_kwargs": {"init_weights": [False]}, "lokr_kwargs": {"init_weights": [False]}, + "oft_kwargs": {"init_weights": [False]}, }, ) ) diff --git a/tests/testing_common.py b/tests/testing_common.py index 00809c2bc1..0c081cde2c 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -574,7 +574,7 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): self.assertTrue(torch.allclose(logits_merged, logits_merged_from_pretrained, atol=atol, rtol=rtol)) def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3, PeftType.OFT] if ("gpt2" in model_id.lower()) and (config_cls == IA3Config): self.skipTest("Merging GPT2 adapters not supported for IAΒ³ (yet)") @@ -886,7 +886,7 @@ def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwar self.assertIsNotNone(param.grad) def _test_delete_adapter(self, model_id, config_cls, config_kwargs): - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3, PeftType.OFT] # IA3 does not support deleting adapters yet, but it just needs to be added # AdaLora does not support multiple adapters config = config_cls( @@ -924,7 +924,7 @@ def _test_delete_adapter(self, model_id, config_cls, config_kwargs): def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): # same as test_delete_adapter, but this time an inactive adapter is deleted - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3, PeftType.OFT] # IA3 does not support deleting adapters yet, but it just needs to be added # AdaLora does not support multiple adapters config = config_cls( From 6a57472665b2b712a84e2bedd98945038283f7cc Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 30 Nov 2023 21:58:16 +0100 Subject: [PATCH 65/65] Mixed adapter models (#1163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Description This PR allows to add adapters of different types, e.g. LoRA and LoHa: base_model = ... config0 = LoraConfig(...) peft_model = get_peft_model(base_model, config0, mixed=True) config1 = LoHaConfig(...) peft_model.add_adapter(config1, "other") peft_model.set_adapter(["default", "other"]) peft_model(x) At this point, both adapters are active at the same time. Existing code should not be affected by this change, since users need to opt into this behavior by setting mixed=True, and a completely different class is being used (PeftMixedModel). Also interesting is that this method can be used for a single adapter type but with very different configs. Right now, we have limited support for that (e.g. for LoRA, different r values by using rank_pattern), but with this, we don't need to special case the differing arguments anymore. Not implemented - [ ] I'm not yet sure if the same logic can be applied to IAΒ³ or if it may fail because IAΒ³ can apply its scaling to the input, not the output. - [ ] OFT is not supported yet but should work. 
- [ ] It is currently not possible to represent a mixed adapter model as a single config. I think we can come up with a solution but I don't think it is necessary for a first version of this. - [ ] Saving and loading is not yet implemented for mixed models. Those could potentially be added in a future PR. --------- Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> --- README.md | 31 + docs/source/_toctree.yml | 2 + docs/source/developer_guides/mixed_models.md | 39 + src/peft/__init__.py | 1 + src/peft/mapping.py | 19 +- src/peft/mixed_model.py | 394 +++++++++ src/peft/tuners/__init__.py | 1 + src/peft/tuners/mixed/__init__.py | 19 + src/peft/tuners/mixed/model.py | 323 ++++++++ tests/test_mixed.py | 794 +++++++++++++++++++ 10 files changed, 1620 insertions(+), 3 deletions(-) create mode 100644 docs/source/developer_guides/mixed_models.md create mode 100644 src/peft/mixed_model.py create mode 100644 src/peft/tuners/mixed/__init__.py create mode 100644 src/peft/tuners/mixed/model.py create mode 100644 tests/test_mixed.py diff --git a/README.md b/README.md index 09846dc61c..06a757ed90 100644 --- a/README.md +++ b/README.md @@ -367,6 +367,8 @@ any GPU memory savings. Please refer issue [[FSDP] FSDP with CPU offload consume ## πŸ€— PEFT as a utility library +### Injecting adapters directly into the model + Inject trainable adapters on any `torch` model using `inject_adapter_in_model` method. Note the method will make no further change to the model. ```python @@ -403,6 +405,35 @@ dummy_outputs = model(dummy_inputs) Learn more about the [low level API in the docs](https://huggingface.co/docs/peft/developer_guides/low_level_api). +### Mixing different adapter types + +Ususally, it is not possible to combine different adapter types in the same model, e.g. combining LoRA with AdaLoRA, LoHa, or LoKr. Using a mixed model, this can, however, be achieved: + +```python +from peft import PeftMixedModel + +model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM").eval() +peft_model = PeftMixedModel.from_pretrained(model, , "adapter0") +peft_model.load_adapter(, "adapter1") +peft_model.set_adapter(["adapter0", "adapter1"]) +result = peft_model(**inputs) +``` + +The main intent is to load already trained adapters and use this only for inference. However, it is also possible to create a PEFT model for training by passing `mixed=True` to `get_peft_model`: + +```python +from peft import get_peft_model, LoraConfig, LoKrConfig + +base_model = ... +config0 = LoraConfig(...) +config1 = LoKrConfig(...) +peft_model = get_peft_model(base_model, config0, "adapter0", mixed=True) +peft_model.add_adapter(config1, "adapter1") +peft_model.set_adapter(["adapter0", "adapter1"]) +for batch in dataloader: + ... +``` + ## Contributing If you would like to contribute to PEFT, please check out our [contributing guide](https://huggingface.co/docs/peft/developer_guides/contributing). 
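The README snippet above leaves the adapter paths elided and does not show how `inputs` is built. A fuller inference sketch, with clearly hypothetical placeholder paths that you would replace with adapters you have trained and saved yourself:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import PeftMixedModel

model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
base_model = AutoModelForCausalLM.from_pretrained(model_id).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

# "path/to/lora_adapter" and "path/to/loha_adapter" are placeholders for saved adapters.
peft_model = PeftMixedModel.from_pretrained(base_model, "path/to/lora_adapter", "adapter0")
peft_model.load_adapter("path/to/loha_adapter", "adapter1")
peft_model.set_adapter(["adapter0", "adapter1"])  # activate both adapters at the same time

inputs = tokenizer("The quick brown fox", return_tensors="pt")
result = peft_model(**inputs)
```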
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 88bedf31d7..25992b3966 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -34,6 +34,8 @@
     title: Working with custom models
   - local: developer_guides/low_level_api
     title: PEFT low level API
+  - local: developer_guides/mixed_models
+    title: Mixing different adapter types
   - local: developer_guides/contributing
     title: Contributing to PEFT
   - local: developer_guides/troubleshooting
diff --git a/docs/source/developer_guides/mixed_models.md b/docs/source/developer_guides/mixed_models.md
new file mode 100644
index 0000000000..93414eee04
--- /dev/null
+++ b/docs/source/developer_guides/mixed_models.md
@@ -0,0 +1,39 @@
+
+
+# Working with mixed adapter types
+
+Normally, it is not possible to mix different adapter types in 🤗 PEFT. For example, even though it is possible to create a PEFT model that has two different LoRA adapters (that can have different config options), it is not possible to combine a LoRA adapter with a LoHa adapter. However, by using a mixed model, this works as long as the adapter types are compatible.
+
+## Loading different adapter types into a PEFT model
+
+To load different adapter types into a PEFT model, proceed the same as if you were loading two adapters of the same type, but use `PeftMixedModel` instead of `PeftModel`:
+
+```py
+from peft import PeftMixedModel
+
+base_model = ...  # load the base model, e.g. from transformers
+# load first adapter, which will be called "default"
+peft_model = PeftMixedModel.from_pretrained(base_model, <path_to_adapter1>)
+peft_model.load_adapter(<path_to_adapter2>, adapter_name="other")
+peft_model.set_adapter(["default", "other"])
+```
+
+The last line is necessary if you want to activate both adapters, otherwise, only the first adapter would be active. Of course, you can add more different adapters by calling `add_adapter` repeatedly.
+
+Currently, the main purpose of mixed adapter types is to combine trained adapters for inference. Although it is technically also possible to train a mixed adapter model, this has not been tested and is not recommended.
+
+## Tips
+
+- Not all adapter types can be combined. See `peft.tuners.mixed.COMPATIBLE_TUNER_TYPES` for a list of compatible types. An error will be raised if you are trying to combine incompatible adapter types.
+- It is possible to mix multiple adapters of the same type. This can be useful to combine adapters with very different configs.
+- If you want to combine a lot of different adapters, it is most performant to add the same types of adapters consecutively. E.g., add LoRA1, LoRA2, LoHa1, LoHa2 in this order, instead of LoRA1, LoHa1, LoRA2, LoHa2. The order will make a difference for the outcome in most cases, but since no order is better a priori, it is best to choose the order that is most performant.
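Following the tip about adding adapters of the same type consecutively, here is a minimal editorial sketch (not part of the patch) that builds a mixed model on a toy module, adding both LoRA adapters before both LoHa adapters. `TinyMLP`, the adapter names, and the config values are invented for illustration; the API calls (`get_peft_model(..., mixed=True)`, `add_adapter`, `set_adapter`) are the ones introduced by this PR.

```python
import torch
from torch import nn

from peft import LoHaConfig, LoraConfig, get_peft_model


class TinyMLP(nn.Module):
    # hypothetical toy model standing in for any base model
    def __init__(self):
        super().__init__()
        self.lin0 = nn.Linear(10, 20)
        self.relu = nn.ReLU()
        self.lin1 = nn.Linear(20, 2)

    def forward(self, x):
        return self.lin1(self.relu(self.lin0(x)))


base_model = TinyMLP().eval()

# two LoRA configs followed by two LoHa configs, added in that order
lora1 = LoraConfig(target_modules=["lin0"], r=8)
lora2 = LoraConfig(target_modules=["lin1"], r=4)
loha1 = LoHaConfig(target_modules=["lin0"], r=4)
loha2 = LoHaConfig(target_modules=["lin1"], r=4)

peft_model = get_peft_model(base_model, lora1, adapter_name="lora1", mixed=True)
peft_model.add_adapter("lora2", lora2)
peft_model.add_adapter("loha1", loha1)
peft_model.add_adapter("loha2", loha2)
peft_model.set_adapter(["lora1", "lora2", "loha1", "loha2"])

with torch.no_grad():
    output = peft_model(torch.randn(1, 10))
```

Grouping the adapters by type keeps layers of the same adapter family nested together, which the guide above describes as the more performant layout.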
diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 75ddda498c..2b1883ebd7 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -35,6 +35,7 @@ get_peft_model, inject_adapter_in_model, ) +from .mixed_model import PeftMixedModel from .peft_model import ( PeftModel, PeftModelForCausalLM, diff --git a/src/peft/mapping.py b/src/peft/mapping.py index 60503fa985..f34bdb51c5 100644 --- a/src/peft/mapping.py +++ b/src/peft/mapping.py @@ -20,6 +20,7 @@ import torch from .config import PeftConfig +from .mixed_model import PeftMixedModel from .peft_model import ( PeftModel, PeftModelForCausalLM, @@ -99,13 +100,21 @@ def get_peft_config(config_dict: Dict[str, Any]) -> PeftConfig: return PEFT_TYPE_TO_CONFIG_MAPPING[config_dict["peft_type"]](**config_dict) -def get_peft_model(model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default") -> PeftModel: +def get_peft_model( + model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default", mixed: bool = False +) -> PeftModel | PeftMixedModel: """ Returns a Peft model object from a model and a config. Args: - model ([`transformers.PreTrainedModel`]): Model to be wrapped. - peft_config ([`PeftConfig`]): Configuration object containing the parameters of the Peft model. + model ([`transformers.PreTrainedModel`]): + Model to be wrapped. + peft_config ([`PeftConfig`]): + Configuration object containing the parameters of the Peft model. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the adapter to be injected, if not provided, the default adapter name is used ("default"). + mixed (`bool`, `optional`, defaults to `False`): + Whether to allow mixing different (compatible) adapter types. """ model_config = getattr(model, "config", {"model_type": "custom"}) if hasattr(model_config, "to_dict"): @@ -113,8 +122,12 @@ def get_peft_model(model: PreTrainedModel, peft_config: PeftConfig, adapter_name peft_config.base_model_name_or_path = model.__dict__.get("name_or_path", None) + if mixed: + return PeftMixedModel(model, peft_config, adapter_name=adapter_name) + if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning: return PeftModel(model, peft_config, adapter_name=adapter_name) + if peft_config.is_prompt_learning: peft_config = _prepare_prompt_learning_config(peft_config, model_config) return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name) diff --git a/src/peft/mixed_model.py b/src/peft/mixed_model.py new file mode 100644 index 0000000000..55892851e9 --- /dev/null +++ b/src/peft/mixed_model.py @@ -0,0 +1,394 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import os +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from accelerate.hooks import remove_hook_from_submodules +from torch import nn +from transformers.utils import PushToHubMixin + +from peft.tuners.mixed import COMPATIBLE_TUNER_TYPES + +from .config import PeftConfig +from .peft_model import PeftModel +from .tuners import ( + AdaLoraModel, + IA3Model, + LoHaModel, + LoKrModel, + LoraModel, + MixedModel, +) +from .utils import PeftType, _set_adapter, _set_trainable + + +PEFT_TYPE_TO_MODEL_MAPPING = { + PeftType.LORA: LoraModel, + PeftType.LOHA: LoHaModel, + PeftType.LOKR: LoKrModel, + PeftType.ADALORA: AdaLoraModel, + PeftType.IA3: IA3Model, +} + + +def _prepare_model_for_gradient_checkpointing(model: nn.Module) -> None: + r""" + Prepares the model for gradient checkpointing if necessary + """ + # Note: same as PeftModel._prepare_model_for_gradient_checkpointing + if not getattr(model, "is_gradient_checkpointing", True): + return model + + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + +def _check_config_compatible(peft_config: PeftConfig) -> None: + if peft_config.peft_type not in COMPATIBLE_TUNER_TYPES: + raise ValueError( + f"The provided `peft_type` '{peft_config.peft_type.value}' is not compatible with the `PeftMixedModel`. " + f"Compatible types are: {COMPATIBLE_TUNER_TYPES}" + ) + + +class PeftMixedModel(PushToHubMixin, torch.nn.Module): + """ + Peft model for mixing different types of adapters. + + This class currently does not support saving and loading. Instead, it is assumed that the adapters are already + trained and loading the model requires a script to be run each time. + + Currently, the main purpose of mixed adapter types is to combine trained adapters for inference. Although it is + technically possible to train a mixed adapter model, this has not been tested and is not recommended. + + Note: This class should usually not be initialized directly. Instead, use `get_peft_model` with the argument + `mixed=True`. + + Below is an example that shows how to load a mixed model with two different types of adapters. + + ```py + >>> from peft import get_peft_model + + >>> base_model = ... # load the base model, e.g. from transformers + >>> peft_model = PeftMixedModel.from_pretrained(base_model, path_to_adapter1, "adapter1").eval() + >>> peft_model.load_adapter(path_to_adapter2, "adapter2") + >>> peft_model.set_adapter(["adapter1", "adapter2"]) # activate both adapters + >>> peft_model(data) # forward pass using both adapters + ``` + + Tips: + + - Not all adapter types can be combined. See `peft.tuners.mixed.COMPATIBLE_TUNER_TYPES` for a list of compatible + types. An error will be raised if you are trying to combine incompatible adapter types. + - It is possible to mix multiple adapters of the same type. This can be useful to combine adapters with very + different configs. + - If you want to combine a lot of different adapters, it is most performant to add the same types of adapters + consecutively. 
E.g., add LoRA1, LoRA2, LoHa1, LoHa2 in this order, instead of LoRA1, LoHa1, LoRA2, LoHa2. As long + as the adapters are commutative, the order does not matter for the final result. + + Args: + model (`torch.nn.Module`): + The model to be tuned. + config (`PeftConfig`): + The config of the model to be tuned. The adapter type must be compatible. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the first adapter. + """ + + def __init__(self, model: nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__() + _check_config_compatible(peft_config) + _prepare_model_for_gradient_checkpointing(model) + self.modules_to_save = None + self.base_model = MixedModel(model, {adapter_name: peft_config}, adapter_name) + self.set_modules_to_save(peft_config, adapter_name) + + self.config = getattr(model, "config", {"model_type": "custom"}) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. + if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + return self.base_model.peft_config + + @property + def active_adapter(self) -> str: + return self.base_model.active_adapter + + @property + def active_adapters(self) -> list[str]: + return self.base_model.active_adapters + + def get_nb_trainable_parameters(self): + r""" + Returns the number of trainable parameters and number of all parameters in the model. + """ + # note: same as PeftModel.get_nb_trainable_parameters + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + num_params = num_params * 2 + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model. + """ + # note: same as PeftModel.print_trainable_parameters + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || " + f"all params: {all_param:,d} || " + f"trainable%: {100 * trainable_params / all_param:.4f}" + ) + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ + return self.base_model(*args, **kwargs) + + def generate(self, *args: Any, **kwargs: Any): + """ + Generate output. + """ + return self.base_model.generate(*args, **kwargs) + + @contextmanager + def disable_adapter(self): + """ + Disables the adapter module. 
+ """ + try: + self.base_model.disable_adapter_layers() + yield + finally: + self.base_model.enable_adapter_layers() + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig): + _check_config_compatible(peft_config) + + try: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter(self, adapter_name) + except Exception: # somthing went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + self.set_modules_to_save(peft_config, adapter_name) + + def set_modules_to_save(self, peft_config: PeftConfig, adapter_name: str) -> None: + if (modules_to_save := getattr(peft_config, "modules_to_save", None)) is None: + return + + if self.modules_to_save is None: + self.modules_to_save = set(modules_to_save) + else: + self.modules_to_save.update(modules_to_save) + _set_trainable(self, adapter_name) + + def set_adapter(self, adapter_name: Union[str, list[str]]) -> None: + """ + Sets the active adapter(s) for the model. + + Note that the order in which the adapters are applied during the forward pass may not be the same as the order + in which they are passed to this function. Instead, the order during the forward pass is determined by the + order in which the adapters were loaded into the model. The active adapters only determine which adapters are + active during the forward pass, but not the order in which they are applied. + + Args: + adapter_name (`str` or `List[str]`): + The name of the adapter(s) to be activated. + """ + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.set_adapter(adapter_name) + _set_adapter(self, adapter_name) + + def delete_adapter(self, adapter_name: Union[str, list[str]]) -> None: + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.delete_adapter(adapter_name) + + def merge_and_unload(self, *args: Any, **kwargs: Any): + r""" + This method merges the adapter layers into the base model. This is needed if someone wants to use the base + model as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + return self.base_model.merge_and_unload(*args, **kwargs) + + def unload(self, *args: Any, **kwargs: Any): + """ + Gets back the base model by removing all the adapter modules without merging. This gives back the original base + model. 
+ """ + return self.base_model.unload(*args, **kwargs) + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + return PeftModel._split_kwargs(kwargs) + + def load_adapter(self, model_id: str, adapter_name: str, *args: Any, **kwargs: Any): + output = PeftModel.load_adapter(self, model_id, adapter_name, *args, **kwargs) + # TODO: not quite clear why this is necessary but tests fail without it + self.set_adapter(self.active_adapters) + return output + + def create_or_update_model_card(self, output_dir: str): + raise NotImplementedError(f"Model card creation is not supported for {self.__class__.__name__} (yet).") + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = False, + selected_adapters: Optional[list[str]] = None, + **kwargs: Any, + ): + raise NotImplementedError(f"Saving is not supported for {self.__class__.__name__} (yet).") + + @classmethod + def from_pretrained( + cls, + model: nn.Module, + model_id: str | os.PathLike, + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs: Any, + ): + r""" + Instantiate a PEFT mixed model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model (`nn.Module`): + The model to be adapted. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and use for + inference + config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuation. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. 
+ """ + # note: adapted from PeftModel.from_pretrained + from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + # load the config + if config is None: + config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + subfolder=kwargs.get("subfolder", None), + revision=kwargs.get("revision", None), + cache_dir=kwargs.get("cache_dir", None), + use_auth_token=kwargs.get("use_auth_token", None), + ) + ].from_pretrained(model_id, **kwargs) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + # note: this is different from PeftModel.from_pretrained + if config.peft_type not in PEFT_TYPE_TO_MODEL_MAPPING: + raise ValueError(f"Adapter of type {config.peft_type} is not supported for mixed models.") + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + # note: should not be possible to reach, but just in case + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + + # note: this is different from PeftModel.from_pretrained, we always return a PeftMixedModel + model = cls(model, config, adapter_name) + model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs) + return model diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index f5f665dd99..9211cfb4f8 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -28,3 +28,4 @@ from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit from .oft import OFTConfig, OFTModel +from .mixed import MixedModel diff --git a/src/peft/tuners/mixed/__init__.py b/src/peft/tuners/mixed/__init__.py new file mode 100644 index 0000000000..f21cff3b29 --- /dev/null +++ b/src/peft/tuners/mixed/__init__.py @@ -0,0 +1,19 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .model import COMPATIBLE_TUNER_TYPES, MixedModel + + +__all__ = ["COMPATIBLE_TUNER_TYPES", "MixedModel"] diff --git a/src/peft/tuners/mixed/model.py b/src/peft/tuners/mixed/model.py new file mode 100644 index 0000000000..5e7acf1cfe --- /dev/null +++ b/src/peft/tuners/mixed/model.py @@ -0,0 +1,323 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import warnings +from typing import Any, Optional, Union + +from torch import nn +from tqdm import tqdm + +from peft.tuners import adalora, loha, lokr, lora +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + PeftType, + _get_submodules, + get_auto_gptq_quant_linear, +) + + +# Collection of constants used for all tuners +COMPATIBLE_TUNER_TYPES = (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA) +PREFIXES = [lora.LoraModel.prefix, lokr.LoKrModel.prefix, loha.LoHaModel.prefix] +Configs = Union[lora.LoraConfig, loha.LoHaConfig, lokr.LoKrConfig, adalora.AdaLoraConfig] +Layers = (lora.layer.LoraLayer, loha.layer.LoHaLayer, lokr.layer.LoKrLayer, adalora.layer.AdaLoraLayer) + + +class MixedModel(BaseTuner): + """ + A class that allows to mix different types of adapters in a single model. + + Note: This class should usually not be initialized directly. Instead, use `get_peft_model` with the argument + `mixed=True`. + + Args: + model (:obj:`nn.Module`): + The model to be tuned. + config (:obj:`PeftConfig`): + The config of the model to be tuned. The adapter type must be compatible. + adapter_name (:obj:`str`): + The name of the first adapter. + """ + + def __init__(self, model: nn.Module, config: Configs, adapter_name: str) -> None: + super().__init__(model, config, adapter_name) + + def _check_new_adapter_config(self, config: Configs) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + if not isinstance(config, Configs.__args__): + raise ValueError( + f"{self.__class__.__name__} only supports {COMPATIBLE_TUNER_TYPES} configs, but got {type(config)}." + ) + + biases = (getattr(config, "bias", None) for config in self.peft_config) + biases = [bias for bias in biases if bias not in (None, "none")] + if len(biases) > 1: + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." 
+ ) + + @staticmethod + def _check_target_module_exists(config: Configs, key: str): + return check_target_module_exists(config, key) + + def _create_and_replace( + self, + config: Configs, + *args: Any, + **kwargs: Any, + ) -> None: + if isinstance(config, adalora.AdaLoraConfig): + adalora.AdaLoraModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, lora.LoraConfig): + lora.LoraModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, loha.LoHaConfig): + loha.LoHaModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, lokr.LoKrConfig): + lokr.LoKrModel._create_and_replace(self, config, *args, **kwargs) + else: + raise ValueError(f"Unsupported config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.") + + def _replace_module(self, parent, child_name, new_module, child) -> None: + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.get_base_layer() + elif hasattr(child, "quant_linear_module"): + # TODO maybe not necessary to have special treatment? + child = child.quant_linear_module + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if any(prefix in name for prefix in PREFIXES): + module.to(child.weight.device) + if "ranknum" in name: + module.to(child.weight.device) + + def _mark_only_adapters_as_trainable(self) -> None: + for n, p in self.model.named_parameters(): + if not any(prefix in n for prefix in PREFIXES): + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = getattr(self.peft_config[active_adapter], "bias", "none") + if bias == "none": + continue + + if bias == "all": + for n, p in self.model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "lora_only": + # TODO: check if this is needed for other supported types + for m in self.model.modules(): + if isinstance(m, Layers) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise ValueError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(config, adapter_name, target, **kwargs): + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + if (gptq_quantization_config is not None) or (AutoGPTQQuantLinear is not None): + raise ValueError(f"GPTQ quantization not supported for {config.peft_type.value} (yet).") + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + if loaded_in_8bit or loaded_in_4bit: + raise ValueError(f"8bit and 4bit quantization not supported for {config.peft_type.value} (yet).") + + if isinstance(config, adalora.AdaLoraConfig): + new_module = adalora.AdaLoraModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, lora.LoraConfig): + new_module = lora.LoraModel._create_new_module(config, adapter_name, target, **kwargs) + elif 
isinstance(config, loha.LoHaConfig): + new_module = loha.LoHaModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, lokr.LoKrConfig): + new_module = lokr.LoKrModel._create_new_module(config, adapter_name, target, **kwargs) + else: + raise ValueError(f"Unknown config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.") + return new_module + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + for active_adapter in self.active_adapters: + val = getattr(self.peft_config[active_adapter], "bias", "none") + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: Union[str, list[str]]) -> None: + for module in self.model.modules(): + if isinstance(module, Layers): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge layers when the model is gptq quantized") + + def merge_recursively(module): + # helper function to recursively merge the base_layer of the target + path = [] + layer = module + while hasattr(layer, "base_layer"): + path.append(layer) + layer = layer.base_layer + for layer_before, layer_after in zip(path[:-1], path[1:]): + layer_after.merge(safe_merge=safe_merge, adapter_names=adapter_names) + layer_before.base_layer = layer_after.base_layer + module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + + key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)] + desc = "Unloading " + ("and merging " if merge else "") + "model" + + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if hasattr(target, "base_layer"): + if merge: + merge_recursively(target) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + setattr(parent, target_name, target.modules_to_save[target.active_adapter]) + + return self.model + + def add_weighted_adapter(self, *args: Any, **kwargs: Any) -> None: + raise NotImplementedError(f"Weighted adapters are not supported for {self.__class__.__name__} (yet).") + + def delete_adapter(self, 
adapter_name: Union[str, list[str]]) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (Union[str, list[str]]): Name of the adapter(s) to delete. + """ + if isinstance(adapter_name, str): + adapter_names = [adapter_name] + else: + adapter_names = adapter_name + + mismatched = set(adapter_names) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + for adapter_name in adapter_names: + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, BaseTunerLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> nn.Module: + r""" + This method merges the layers into the base model. This is needed if someone wants to use the base model as a + standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def generate(self, *args: Any, **kwargs: Any): + return self.model.generate(*args, **kwargs) diff --git a/tests/test_mixed.py b/tests/test_mixed.py new file mode 100644 index 0000000000..bd8f455e99 --- /dev/null +++ b/tests/test_mixed.py @@ -0,0 +1,794 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import copy
+import itertools
+import os
+import re
+import tempfile
+import unittest
+
+import torch
+from parameterized import parameterized
+from torch import nn
+from transformers import AutoModelForCausalLM
+
+from peft import AdaLoraConfig, LoHaConfig, LoKrConfig, LoraConfig, PeftMixedModel, PrefixTuningConfig, get_peft_model
+from peft.tuners.tuners_utils import BaseTunerLayer
+from peft.utils import infer_device
+
+
+class SimpleNet(nn.Module):
+    def __init__(self, bias=True):
+        super().__init__()
+        self.lin0 = nn.Linear(10, 20, bias=bias)
+        self.relu = nn.ReLU()
+        self.lin1 = nn.Linear(20, 2, bias=bias)
+
+    def forward(self, X):
+        X = X.float()
+        X = self.lin0(X)
+        X = self.relu(X)
+        X = self.lin1(X)
+        return X
+
+
+def _param_name_func(testcase_func, param_num, params):
+    # for parameterized tests in TestMixedAdapterTypes
+    config0, config1 = params[0]
+    name0 = config0.__class__.__name__
+    name1 = config1.__class__.__name__
+    if name0 != name1:
+        return f"{testcase_func.__name__}_{param_num}_{name0}_{name1}"
+    return f"{testcase_func.__name__}_{param_num}_{name0}_x2"
+
+
+class TestMixedAdapterTypes(unittest.TestCase):
+    torch_device = infer_device()
+
+    def _get_model(self, model_cls, peft_config=None, adapter_name=None, seed=0, mixed=True):
+        torch.manual_seed(0)  # always use seed 0 for base model, seed for adapters may differ
+        base_model = model_cls().eval().to(self.torch_device)
+        if peft_config is None:
+            return base_model
+
+        torch.manual_seed(seed)
+        assert adapter_name is not None
+        peft_model = get_peft_model(base_model, peft_config, adapter_name=adapter_name, mixed=mixed)
+        return peft_model.eval().to(self.torch_device)
+
+    def _check_mixed_outputs(self, model_cls, config0, config1, input, *, is_commutative):
+        # This test checks different combinations of adapter0, adapter1, or combinations of the two, and whether
+        # outputs are the same/different, depending on context. If we pass is_commutative=True, it means that the order
+        # of adapters does not matter, and we expect the same output regardless of the order in which adapters are
+        # applied.
+        # We have to be very careful with resetting the random seed each time it is used, otherwise the adapters may be
+        # initialized with different values, and the test will fail.
+ + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + # base model + base_model = self._get_model(model_cls) + output_base = base_model(input) + self.assertTrue(torch.isfinite(output_base).all()) + + # adapter 0 + peft_model_0 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + output_config0 = peft_model_0(input) + + self.assertTrue(torch.isfinite(output_config0).all()) + self.assertFalse(torch.allclose(output_base, output_config0, atol=atol, rtol=rtol)) + + # adapter 1 + peft_model_1 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + output_config1 = peft_model_1(input) + + self.assertTrue(torch.isfinite(output_config1).all()) + self.assertFalse(torch.allclose(output_base, output_config1, atol=atol, rtol=rtol)) + self.assertFalse(torch.allclose(output_config0, output_config1, atol=atol, rtol=rtol)) + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + + # check the number of tuner layer types + tuner_layers = [mod for mod in peft_model_01.modules() if isinstance(mod, BaseTunerLayer)] + tuner_types = {type(tuner_layer) for tuner_layer in tuner_layers} + if type(config0) == type(config1): + self.assertEqual(len(tuner_types), 1) + else: + self.assertEqual(len(tuner_types), 2) + + self.assertEqual(peft_model_01.active_adapters, ["adapter0", "adapter1"]) + self.assertTrue(torch.isfinite(output_mixed_01).all()) + self.assertFalse(torch.allclose(output_config0, output_mixed_01, atol=atol, rtol=rtol)) + self.assertFalse(torch.allclose(output_config1, output_mixed_01, atol=atol, rtol=rtol)) + if is_commutative: + delta0 = output_config0 - output_base + delta1 = output_config1 - output_base + delta_mixed_01 = output_mixed_01 - output_base + self.assertTrue(torch.allclose(delta0 + delta1, delta_mixed_01, atol=atol, rtol=rtol)) + + # adapter 1 + 0 + peft_model_10 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + torch.manual_seed(seed0) + peft_model_10.add_adapter("adapter0", config0) + peft_model_10.set_adapter(["adapter1", "adapter0"]) + output_mixed_10 = peft_model_10(input) + + # check the number of tuner layer types + tuner_layers = [mod for mod in peft_model_10.modules() if isinstance(mod, BaseTunerLayer)] + tuner_types = {type(tuner_layer) for tuner_layer in tuner_layers} + if type(config0) == type(config1): + self.assertEqual(len(tuner_types), 1) + else: + self.assertEqual(len(tuner_types), 2) + + self.assertEqual(peft_model_10.active_adapters, ["adapter1", "adapter0"]) + self.assertTrue(torch.isfinite(output_mixed_10).all()) + self.assertFalse(torch.allclose(output_config0, output_mixed_10, atol=atol, rtol=rtol)) + self.assertFalse(torch.allclose(output_config1, output_mixed_10, atol=atol, rtol=rtol)) + if is_commutative: + self.assertTrue(torch.allclose(output_mixed_01, output_mixed_10, atol=atol, rtol=rtol)) + + # turn around the order of the adapters of the 0 + 1 mixed model, should behave like the 0 + 1 mixed model + peft_model_10.set_adapter(["adapter0", "adapter1"]) + output_mixed_reversed = peft_model_10(input) + + # check the number of tuner layer types + tuner_layers = [mod for mod in peft_model_10.modules() if isinstance(mod, BaseTunerLayer)] + tuner_types = {type(tuner_layer) for tuner_layer in tuner_layers} + if type(config0) == type(config1): + self.assertEqual(len(tuner_types), 1) + else: + 
self.assertEqual(len(tuner_types), 2) + + self.assertEqual(peft_model_10.active_adapters, ["adapter0", "adapter1"]) + self.assertTrue(torch.isfinite(output_mixed_reversed).all()) + self.assertTrue(torch.allclose(output_mixed_reversed, output_mixed_01, atol=atol, rtol=rtol)) + self.assertFalse(torch.allclose(output_mixed_reversed, output_config0, atol=atol, rtol=rtol)) + self.assertFalse(torch.allclose(output_mixed_reversed, output_config1, atol=atol, rtol=rtol)) + if is_commutative: + self.assertTrue(torch.allclose(output_mixed_reversed, output_mixed_10, atol=atol, rtol=rtol)) + + def _check_merging(self, model_cls, config0, config1, input): + # Ensure that when merging mixed adapters, the result is the same as when applying the adapters separately. + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + + model_merged_01 = peft_model_01.merge_and_unload() + output_merged_01 = model_merged_01(input) + self.assertTrue(torch.allclose(output_mixed_01, output_merged_01, atol=atol, rtol=rtol)) + + # adapter 1 + 0 + peft_model_10 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + torch.manual_seed(seed0) + peft_model_10.add_adapter("adapter0", config0) + peft_model_10.set_adapter(["adapter1", "adapter0"]) + output_mixed_10 = peft_model_10(input) + + model_merged_10 = peft_model_10.merge_and_unload() + output_merged_10 = model_merged_10(input) + self.assertTrue(torch.allclose(output_mixed_10, output_merged_10, atol=atol, rtol=rtol)) + + def _check_unload(self, model_cls, config0, config1, input): + # Ensure that we can unload the base model without merging + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + base_model = self._get_model(model_cls) + output_base = base_model(input) + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed = peft_model_01(input) + + # unload + model_unloaded = peft_model_01.unload() + output_unloaded = model_unloaded(input) + + self.assertFalse(torch.allclose(output_mixed, output_unloaded, atol=atol, rtol=rtol)) + self.assertTrue(torch.allclose(output_base, output_unloaded, atol=atol, rtol=rtol)) + + def _check_disable(self, model_cls, config0, config1, input): + # Ensure that we can disable adapters + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + # base model + base_model = self._get_model(model_cls) + output_base = base_model(input) + + # adapter 0 + peft_model_0 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + output_config0 = peft_model_0(input) + with peft_model_0.disable_adapter(): + output_disabled0 = peft_model_0(input) + + self.assertFalse(torch.allclose(output_base, output_config0, atol=atol, rtol=rtol)) + self.assertTrue(torch.allclose(output_base, output_disabled0, atol=atol, rtol=rtol)) + + # adapter 1 + peft_model_1 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + output_config1 = peft_model_1(input) + with peft_model_1.disable_adapter(): + output_disabled1 = peft_model_1(input) + + self.assertFalse(torch.allclose(output_base, output_config1, atol=atol, rtol=rtol)) + self.assertTrue(torch.allclose(output_base, output_disabled1, atol=atol, rtol=rtol)) + + 
# adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + with peft_model_01.disable_adapter(): + output_disabled01 = peft_model_01(input) + + self.assertFalse(torch.allclose(output_base, output_mixed_01, atol=atol, rtol=rtol)) + self.assertTrue(torch.allclose(output_base, output_disabled01, atol=atol, rtol=rtol)) + + # adapter 1 + 0 + peft_model_10 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + torch.manual_seed(seed0) + peft_model_10.add_adapter("adapter0", config0) + peft_model_10.set_adapter(["adapter1", "adapter0"]) + output_mixed_10 = peft_model_10(input) + with peft_model_10.disable_adapter(): + output_disabled10 = peft_model_10(input) + + self.assertFalse(torch.allclose(output_base, output_mixed_10, atol=atol, rtol=rtol)) + self.assertTrue(torch.allclose(output_base, output_disabled10, atol=atol, rtol=rtol)) + + def _check_loading(self, model_cls, config0, config1, input): + # Check that we can load two adapters into the same model + # Note that we save the adapters using a normal PeftModel because PeftMixModel doesn't support saving yet + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + with tempfile.TemporaryDirectory() as tmp_dirname: + # SAVING + # adapter 0: note that we set mixed=False because mixed models don't support saving (yet) + peft_model_0 = self._get_model(model_cls, config0, "adapter0", seed=seed0, mixed=False) + output_config0 = peft_model_0(input) + peft_model_0.save_pretrained(os.path.join(tmp_dirname, "adapter0")) + + # adapter 1: note that we set mixed=False because mixed models don't support saving (yet) + peft_model_1 = self._get_model(model_cls, config1, "adapter1", seed=seed1, mixed=False) + output_config1 = peft_model_1(input) + peft_model_1.save_pretrained(os.path.join(tmp_dirname, "adapter1")) + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + + # LOADING + # adapter 0 + base_model = self._get_model(model_cls) + # Notes: + # Path is tmp_dirname/adapter0/adapter0 because non-default adapters are saved in a subfolder. + # As a sanity check, we should set a completely different seed here. That way, we ensure that the the + # weights are not just randomly initialized exactly to the same values as before. 
+ torch.manual_seed(123456) + peft_model_loaded0 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter0", "adapter0"), "adapter0" + ) + output_loaded0 = peft_model_loaded0(input) + self.assertTrue(torch.allclose(output_config0, output_loaded0, atol=atol, rtol=rtol)) + + # adapter 1 + base_model = self._get_model(model_cls) + torch.manual_seed(654321) # setting a completely different seed here should not affect the result + peft_model_loaded1 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter1", "adapter1"), "adapter1" + ) + output_loaded1 = peft_model_loaded1(input) + self.assertTrue(torch.allclose(output_config1, output_loaded1, atol=atol, rtol=rtol)) + + # adapter 0 + 1 + base_model = self._get_model(model_cls) + torch.manual_seed(97531) # setting a completely different seed here should not affect the result + peft_model_loaded_01 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter0", "adapter0"), "adapter0" + ) + peft_model_loaded_01.load_adapter(os.path.join(tmp_dirname, "adapter1", "adapter1"), "adapter1") + # at this point, "config0" should still be active + self.assertEqual(peft_model_loaded_01.active_adapters, ["adapter0"]) + output_loaded01_0 = peft_model_loaded_01(input) + self.assertTrue(torch.allclose(output_config0, output_loaded01_0, atol=atol, rtol=rtol)) + # activate adapter1 + peft_model_loaded_01.set_adapter(["adapter1"]) + self.assertEqual(peft_model_loaded_01.active_adapters, ["adapter1"]) + output_loaded01_1 = peft_model_loaded_01(input) + self.assertTrue(torch.allclose(output_config1, output_loaded01_1, atol=atol, rtol=rtol)) + # activate both adapters + peft_model_loaded_01.set_adapter(["adapter0", "adapter1"]) + output_loaded01 = peft_model_loaded_01(input) + self.assertTrue(torch.allclose(output_mixed_01, output_loaded01, atol=atol, rtol=rtol)) + + # adapter 1 + 0 + base_model = self._get_model(model_cls) + torch.manual_seed(445566) # setting a completely different seed here should not affect the result + peft_model_loaded_10 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter1", "adapter1"), "adapter1" + ) + peft_model_loaded_10.load_adapter(os.path.join(tmp_dirname, "adapter0", "adapter0"), "adapter0") + # at this point, "config0" should still be active + self.assertEqual(peft_model_loaded_10.active_adapters, ["adapter1"]) + output_loaded10_1 = peft_model_loaded_10(input) + self.assertTrue(torch.allclose(output_config1, output_loaded10_1, atol=atol, rtol=rtol)) + # activate adapter1 + peft_model_loaded_10.set_adapter(["adapter0"]) + self.assertEqual(peft_model_loaded_10.active_adapters, ["adapter0"]) + output_loaded10_0 = peft_model_loaded_10(input) + self.assertTrue(torch.allclose(output_config0, output_loaded10_0, atol=atol, rtol=rtol)) + # activate both adapters + peft_model_loaded_10.set_adapter(["adapter1", "adapter0"]) + output_loaded10 = peft_model_loaded_10(input) + self.assertTrue(torch.allclose(output_mixed_01, output_loaded10, atol=atol, rtol=rtol)) + + @parameterized.expand( + itertools.combinations( + [ + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoHaConfig(target_modules=["lin0"], init_weights=False), + LoKrConfig(target_modules=["lin0"], init_weights=False), + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False), + ], + r=2, + ), + name_func=_param_name_func, + ) + def test_target_first_layer(self, config0, config1): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + 
self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=False) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config1, config0, input) + self._check_loading(SimpleNet, config0, config1, input) + + @parameterized.expand( + itertools.combinations( + [ + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + LoHaConfig(target_modules=["lin1"], init_weights=False), + LoKrConfig(target_modules=["lin1"], init_weights=False), + AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False), + ], + r=2, + ), + name_func=_param_name_func, + ) + def test_target_last_layer(self, config0, config1): + # We are targeting the last layer of the SimpleNet. Therefore, since the adapters only add their activations + # to the output, the results should be commutative. This would *not* work if the adapters do something more + # complex or if we target an earlier layer, because of the non-linearity would destroy the commutativity. + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=True) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config1, config0, input) + self._check_loading(SimpleNet, config0, config1, input) + + @parameterized.expand( + [ + ( + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoHaConfig(target_modules=["lin1"], init_weights=False), + ), + ( + LoHaConfig(target_modules=["lin0"], init_weights=False), + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoKrConfig(target_modules=["lin1"], init_weights=False), + ), + ( + LoKrConfig(target_modules=["lin0"], init_weights=False), + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + LoHaConfig(target_modules=["lin0"], init_weights=False), + LoKrConfig(target_modules=["lin1"], init_weights=False), + ), + ( + LoKrConfig(target_modules=["lin0"], init_weights=False), + LoHaConfig(target_modules=["lin1"], init_weights=False), + ), + ( + LoHaConfig(target_modules=["lin0"], init_weights=False), + AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoHaConfig(target_modules=["lin1"], init_weights=False), + ), + ( + LoKrConfig(target_modules=["lin0"], init_weights=False), + AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoKrConfig(target_modules=["lin1"], init_weights=False), + ), + ], + name_func=_param_name_func, + ) + def test_target_different_layers(self, config0, config1): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=False) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config1, config0, input) + self._check_loading(SimpleNet, config0, config1, input) + + 
@parameterized.expand(
+        [
+            (
+                LoraConfig(target_modules=["lin1"], init_lora_weights=False),
+                LoraConfig(target_modules=["lin1"], init_lora_weights=False),
+            ),
+            (
+                LoHaConfig(target_modules=["lin1"], init_weights=False),
+                LoHaConfig(target_modules=["lin1"], init_weights=False),
+            ),
+            (
+                LoKrConfig(target_modules=["lin1"], init_weights=False),
+                LoKrConfig(target_modules=["lin1"], init_weights=False),
+            ),
+            (
+                AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False),
+                AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False),
+            ),
+        ],
+        name_func=_param_name_func,
+    )
+    def test_target_last_layer_same_type(self, config0, config1):
+        input = torch.arange(90).reshape(9, 10).to(self.torch_device)
+        self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=True)
+        self._check_merging(SimpleNet, config0, config1, input)
+        self._check_unload(SimpleNet, config0, config1, input)
+        self._check_disable(SimpleNet, config1, config0, input)
+
+    @parameterized.expand(
+        [
+            (
+                LoraConfig(target_modules=["lin0"], init_lora_weights=False),
+                LoraConfig(target_modules=["lin0"], init_lora_weights=False),
+            ),
+            (
+                LoHaConfig(target_modules=["lin0"], init_weights=False),
+                LoHaConfig(target_modules=["lin0"], init_weights=False),
+            ),
+            (
+                LoKrConfig(target_modules=["lin0"], init_weights=False),
+                LoKrConfig(target_modules=["lin0"], init_weights=False),
+            ),
+            (
+                AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False),
+                AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False),
+            ),
+        ],
+        name_func=_param_name_func,
+    )
+    def test_target_first_layer_same_type(self, config0, config1):
+        input = torch.arange(90).reshape(9, 10).to(self.torch_device)
+        self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=False)
+        self._check_merging(SimpleNet, config0, config1, input)
+        self._check_unload(SimpleNet, config0, config1, input)
+        self._check_disable(SimpleNet, config1, config0, input)
+        self._check_loading(SimpleNet, config0, config1, input)
+
+    def test_deeply_nested(self):
+        # a somewhat absurdly nested model using different adapter types
+        atol = 1e-5
+        rtol = 1e-5
+        torch.manual_seed(0)
+
+        model = SimpleNet().eval().to(self.torch_device)
+        input = torch.arange(90).reshape(9, 10).to(self.torch_device)
+        output_base = model(input)
+
+        config0 = LoraConfig(r=4, lora_alpha=4, target_modules=["lin0", "lin1"], init_lora_weights=False)
+        peft_model = get_peft_model(model, config0, "adapter0", mixed=True)
+
+        config1 = LoHaConfig(r=4, alpha=4, target_modules=["lin0"], init_weights=False)
+        peft_model.add_adapter("adapter1", config1)
+
+        config2 = AdaLoraConfig(r=4, lora_alpha=4, target_modules=["lin1"], init_lora_weights=False)
+        peft_model.add_adapter("adapter2", config2)
+
+        config3 = LoKrConfig(r=4, alpha=4, target_modules=["lin0", "lin1"], init_weights=False)
+        peft_model.add_adapter("adapter3", config3)
+
+        config4 = LoraConfig(r=4, lora_alpha=4, target_modules=["lin0", "lin1"], init_lora_weights=False)
+        peft_model.add_adapter("adapter4", config4)
+
+        peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3", "adapter4"])
+        output_mixed = peft_model(input)
+        self.assertTrue(torch.isfinite(output_base).all())
+        self.assertFalse(torch.allclose(output_base, output_mixed, atol=atol, rtol=rtol))
+
+        # test disabling all adapters
+        with peft_model.disable_adapter():
+            output_disabled = peft_model(input)
+        self.assertTrue(torch.isfinite(output_disabled).all())
+        self.assertTrue(torch.allclose(output_base, output_disabled, atol=atol, rtol=rtol))
+        self.assertFalse(torch.allclose(output_mixed, output_disabled, atol=atol, rtol=rtol))
+
+        # merge and unload all adapters
+        model_copy = copy.deepcopy(peft_model)
+        model = model_copy.merge_and_unload()
+        output_merged = model(input)
+        self.assertTrue(torch.isfinite(output_merged).all())
+        self.assertTrue(torch.allclose(output_mixed, output_merged, atol=atol, rtol=rtol))
+
+        # merge and unload only adapter1 and adapter3
+        model_copy = copy.deepcopy(peft_model)
+        model_copy.set_adapter(["adapter1", "adapter3"])
+        output_13 = model_copy(input)
+        self.assertTrue(torch.isfinite(output_13).all())
+        self.assertFalse(torch.allclose(output_mixed, output_13, atol=atol, rtol=rtol))
+
+        model_copy.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3", "adapter4"])
+        model_merged_unloaded = model_copy.merge_and_unload(adapter_names=["adapter1", "adapter3"])
+        output_merged_13 = model_merged_unloaded(input)
+        self.assertTrue(torch.isfinite(output_merged_13).all())
+        self.assertTrue(torch.allclose(output_13, output_merged_13, atol=atol, rtol=rtol))
+
+        # test unloading
+        model_copy = copy.deepcopy(peft_model)
+        model_unloaded = model_copy.unload()
+        output_unloaded = model_unloaded(input)
+        self.assertTrue(torch.isfinite(output_unloaded).all())
+        self.assertTrue(torch.allclose(output_base, output_unloaded, atol=atol, rtol=rtol))
+
+    def test_delete_adapter(self):
+        atol = 1e-5
+        rtol = 1e-5
+        torch.manual_seed(0)
+
+        model = SimpleNet().eval().to(self.torch_device)
+        input = torch.arange(90).reshape(9, 10).to(self.torch_device)
+        output_base = model(input)
+
+        # create adapter0
+        torch.manual_seed(0)
+        config0 = LoraConfig(r=4, lora_alpha=4, target_modules=["lin0", "lin1"], init_lora_weights=False)
+        peft_model = get_peft_model(model, config0, "adapter0", mixed=True)
+        output_0 = peft_model(input)
+        self.assertFalse(torch.allclose(output_base, output_0, atol=atol, rtol=rtol))
+
+        # add adapter1
+        torch.manual_seed(1)
+        config1 = LoHaConfig(r=4, alpha=4, target_modules=["lin0"], init_weights=False)
+        peft_model.add_adapter("adapter1", config1)
+        peft_model.set_adapter(["adapter0", "adapter1"])
+        output_01 = peft_model(input)
+        self.assertFalse(torch.allclose(output_base, output_01, atol=atol, rtol=rtol))
+        self.assertFalse(torch.allclose(output_0, output_01, atol=atol, rtol=rtol))
+
+        # delete adapter1
+        peft_model.delete_adapter("adapter1")
+        self.assertEqual(peft_model.active_adapters, ["adapter0"])
+        output_deleted_1 = peft_model(input)
+        self.assertTrue(torch.allclose(output_0, output_deleted_1, atol=atol, rtol=rtol))
+
+        msg = re.escape("Adapter(s) ['adapter1'] not found, available adapters: ['adapter0']")
+        with self.assertRaisesRegex(ValueError, expected_regex=msg):
+            peft_model.set_adapter(["adapter0", "adapter1"])
+
+        # re-add adapter1
+        torch.manual_seed(1)
+        peft_model.add_adapter("adapter1", config1)
+        peft_model.set_adapter(["adapter0", "adapter1"])
+        output_01_readded = peft_model(input)
+        self.assertFalse(torch.allclose(output_base, output_01_readded, atol=atol, rtol=rtol))
+
+        # same as above, but this time delete adapter0 first
+        torch.manual_seed(0)
+        model = SimpleNet().eval().to(self.torch_device)
+        torch.manual_seed(0)
+        peft_model = get_peft_model(model, config0, "adapter0", mixed=True)
+        torch.manual_seed(1)
+        peft_model.add_adapter("adapter1", config1)
+        peft_model.delete_adapter("adapter0")
+        self.assertEqual(peft_model.active_adapters, ["adapter1"])
+        output_deleted_0 = peft_model(input)
+        self.assertFalse(torch.allclose(output_deleted_0, output_base, atol=atol, rtol=rtol))
+        self.assertFalse(torch.allclose(output_deleted_0, output_01, atol=atol, rtol=rtol))
+
+        msg = re.escape("Adapter(s) ['adapter0'] not found, available adapters: ['adapter1']")
+        with self.assertRaisesRegex(ValueError, expected_regex=msg):
+            peft_model.set_adapter(["adapter0", "adapter1"])
+
+        peft_model.delete_adapter("adapter1")
+        self.assertEqual(peft_model.active_adapters, [])
+        output_deleted_01 = peft_model(input)
+        self.assertTrue(torch.allclose(output_deleted_01, output_base, atol=atol, rtol=rtol))
+
+    def test_modules_to_save(self):
+        model = SimpleNet().eval().to(self.torch_device)
+        config0 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        peft_model = get_peft_model(model, config0, "adapter0", mixed=True)
+
+        # adding a second adapter with same modules_to_save is not allowed
+        # TODO: theoretically, we could allow this if it's the same target layer
+        config1 = LoHaConfig(target_modules=["lin0"], modules_to_save=["lin1"])
+        peft_model.add_adapter("adapter1", config1)
+        msg = "Only one adapter can be set at a time for modules_to_save"
+        with self.assertRaisesRegex(ValueError, expected_regex=msg):
+            peft_model.set_adapter(["adapter0", "adapter1"])
+
+    def test_get_nb_trainable_parameters(self):
+        model = SimpleNet().eval().to(self.torch_device)
+        config0 = LoraConfig(target_modules=["lin0"])
+        peft_model = get_peft_model(model, config0, "adapter0", mixed=True)
+        trainable_params0, all_param0 = peft_model.get_nb_trainable_parameters()
+
+        params_base = 262
+        params_lora = sum(p.numel() for n, p in model.named_parameters() if "adapter0" in n)
+        self.assertEqual(trainable_params0, params_lora)
+        self.assertEqual(all_param0, params_base + params_lora)
+
+        config1 = LoHaConfig(target_modules=["lin1"])
+        peft_model.add_adapter("adapter1", config1)
+        peft_model.set_adapter(["adapter0", "adapter1"])
+        params_loha = sum(p.numel() for n, p in model.named_parameters() if "adapter1" in n)
+        trainable_params1, all_param1 = peft_model.get_nb_trainable_parameters()
+        self.assertEqual(trainable_params1, params_lora + params_loha)
+        self.assertEqual(all_param1, params_base + params_lora + params_loha)
+
+        config2 = AdaLoraConfig(target_modules=["lin0", "lin1"])
+        peft_model.add_adapter("adapter2", config2)
+        peft_model.set_adapter(["adapter0", "adapter1", "adapter2"])
+        params_adalora = sum(p.numel() for n, p in model.named_parameters() if "adapter2" in n)
+        trainable_params2, all_param2 = peft_model.get_nb_trainable_parameters()
+        # remove 2 params because we need to exclude "ranknum" for AdaLora trainable params
+        self.assertEqual(trainable_params2, params_lora + params_loha + params_adalora - 2)
+        self.assertEqual(all_param2, params_base + params_lora + params_loha + params_adalora)
+
+    def test_incompatible_config_raises(self):
+        model = SimpleNet().eval().to(self.torch_device)
+        config0 = LoraConfig(target_modules=["lin0"])
+        peft_model = get_peft_model(model, config0, "adapter0", mixed=True)
+
+        config1 = PrefixTuningConfig()
+        msg = "The provided `peft_type` 'PREFIX_TUNING' is not compatible with the `PeftMixedModel`."
+        with self.assertRaisesRegex(ValueError, expected_regex=msg):
+            peft_model.add_adapter("adapter1", config1)
+
+    def test_decoder_model(self):
+        # test a somewhat realistic model instead of a toy model
+        torch.manual_seed(0)
+
+        model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
+        model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device)
+        input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device)
+        attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device)
+        input_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        output_base = model.generate(**input_dict)
+
+        torch.manual_seed(0)
+        config0 = LoraConfig(task_type="CAUSAL_LM", init_lora_weights=False)
+        peft_model = get_peft_model(model, config0, "adapter0", mixed=True)
+        output0 = peft_model.generate(**input_dict)
+        self.assertTrue(torch.isfinite(output0).all())
+        self.assertFalse(torch.allclose(output_base, output0))
+
+        torch.manual_seed(1)
+        config1 = LoHaConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "v_proj"], init_weights=False)
+        peft_model.add_adapter("adapter1", config1)
+        peft_model.set_adapter(["adapter0", "adapter1"])
+        output1 = peft_model.generate(**input_dict)
+        self.assertTrue(torch.isfinite(output1).all())
+        self.assertFalse(torch.allclose(output0, output1))
+
+        torch.manual_seed(2)
+        config2 = AdaLoraConfig(task_type="CAUSAL_LM", init_lora_weights=False)
+        peft_model.add_adapter("adapter2", config2)
+        peft_model.set_adapter(["adapter0", "adapter1", "adapter2"])
+        output2 = peft_model.generate(**input_dict)
+        self.assertTrue(torch.isfinite(output2).all())
+        self.assertFalse(torch.allclose(output1, output2))
+
+        torch.manual_seed(3)
+        config3 = LoKrConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "v_proj"], init_weights=False)
+        peft_model.add_adapter("adapter3", config3)
+        peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"])
+        output3 = peft_model.generate(**input_dict)
+        self.assertTrue(torch.isfinite(output3).all())
+        self.assertFalse(torch.allclose(output2, output3))
+
+        with peft_model.disable_adapter():
+            output_disabled = peft_model.generate(**input_dict)
+        self.assertTrue(torch.isfinite(output_disabled).all())
+        self.assertTrue(torch.allclose(output_base, output_disabled))
+
+        model_unloaded = peft_model.merge_and_unload()
+        output_unloaded = model_unloaded.generate(**input_dict)
+        self.assertTrue(torch.isfinite(output_unloaded).all())
+        self.assertTrue(torch.allclose(output3, output_unloaded))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # save adapter0 (use normal PeftModel, because PeftMixedModel does not support saving)
+            torch.manual_seed(0)
+            model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device)
+            torch.manual_seed(0)
+            peft_model = get_peft_model(model, config0, "adapter0")
+            output0_save = peft_model(**input_dict).logits
+            self.assertTrue(torch.isfinite(output0_save).all())
+            peft_model.save_pretrained(tmp_dir)
+
+            # save adapter1
+            torch.manual_seed(0)
+            model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device)
+            torch.manual_seed(1)
+            peft_model = get_peft_model(model, config1, "adapter1")
+            output1_save = peft_model(**input_dict).logits
+            self.assertTrue(torch.isfinite(output1_save).all())
+            peft_model.save_pretrained(tmp_dir)
+
+            # load adapter0 and adapter1
+            model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device)
+            peft_model = PeftMixedModel.from_pretrained(model, os.path.join(tmp_dir, "adapter0"), "adapter0")
+            peft_model.load_adapter(os.path.join(tmp_dir, "adapter1"), "adapter1")
+            peft_model.set_adapter(["adapter0", "adapter1"])
+            output01_loaded = peft_model(**input_dict).logits
+
+            atol, rtol = 1e-3, 1e-3
+            self.assertTrue(torch.isfinite(output01_loaded).all())
+            self.assertFalse(torch.allclose(output0_save, output01_loaded, atol=atol, rtol=rtol))
+            self.assertFalse(torch.allclose(output1_save, output01_loaded, atol=atol, rtol=rtol))