From 04c411010b14492b5bc92e3f13108d98ee0e26e2 Mon Sep 17 00:00:00 2001 From: Jialong Wu Date: Tue, 28 Nov 2023 23:04:52 +0800 Subject: [PATCH 1/6] Examples: add options to save or push model (#1159) --- ...a_clm_accelerate_big_model_inference.ipynb | 18 +++++++++- ...ft_lora_clm_accelerate_ds_zero3_offload.py | 19 +++++++--- .../peft_prefix_tuning_clm.ipynb | 35 +++++++++++++++++-- .../peft_prompt_tuning_clm.ipynb | 35 +++++++++++++++++-- ...ora_seq2seq_accelerate_ds_zero3_offload.py | 20 ++++++++--- .../peft_lora_seq2seq_accelerate_fsdp.py | 18 +++++++--- 6 files changed, 125 insertions(+), 20 deletions(-) diff --git a/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb b/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb index 5d05e1441a..3e2493a391 100644 --- a/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb +++ b/examples/causal_language_modeling/peft_lora_clm_accelerate_big_model_inference.ipynb @@ -210,6 +210,23 @@ "print(next(iter(test_dataloader)))" ] }, + { + "cell_type": "markdown", + "id": "42b14a11", + "metadata": {}, + "source": [ + "You can load model from hub or local\n", + "\n", + "- Load model from Hugging Face Hub, you can change to your own model id\n", + "```python\n", + "peft_model_id = \"username/twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n", + "```\n", + "- Or load model form local\n", + "```python\n", + "peft_model_id = \"twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -244,7 +261,6 @@ "\n", "max_memory = {0: \"1GIB\", 1: \"1GIB\", 2: \"2GIB\", 3: \"10GIB\", \"cpu\": \"30GB\"}\n", "peft_model_id = \"smangrul/twitter_complaints_bigscience_bloomz-7b1_LORA_CAUSAL_LM\"\n", - "\n", "config = PeftConfig.from_pretrained(peft_model_id)\n", "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, device_map=\"auto\", max_memory=max_memory)\n", "model = PeftModel.from_pretrained(model, peft_model_id, device_map=\"auto\", max_memory=max_memory)" diff --git a/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py b/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py index 224d17d813..2b7fcf23b5 100644 --- a/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py +++ b/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py @@ -349,12 +349,21 @@ def test_preprocess_function(examples): pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False) accelerator.wait_for_everyone() - model.push_to_hub( - "smangrul/" - + f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), - state_dict=accelerator.get_state_dict(model), - use_auth_token=True, + # Option1: Pushing the model to Hugging Face Hub + # model.push_to_hub( + # f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), + # token = "hf_..." + # ) + # token (`bool` or `str`, *optional*): + # `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated + # when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + # is not specified. 
+ # Or you can get your token from https://huggingface.co/settings/token + # Option2: Saving the model locally + peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace( + "/", "_" ) + model.save_pretrained(peft_model_id) accelerator.wait_for_everyone() diff --git a/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb b/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb index ef0109bf4a..607b5291a3 100644 --- a/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb +++ b/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb @@ -1228,6 +1228,33 @@ " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" ] }, + { + "cell_type": "markdown", + "id": "0e21c49b", + "metadata": {}, + "source": [ + "You can push model to hub or save model locally. \n", + "\n", + "- Option1: Pushing the model to Hugging Face Hub\n", + "```python\n", + "model.push_to_hub(\n", + " f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\"),\n", + " token = \"hf_...\"\n", + ")\n", + "```\n", + "token (`bool` or `str`, *optional*):\n", + " `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated\n", + " when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n", + " is not specified.\n", + " Or you can get your token from https://huggingface.co/settings/token\n", + "```\n", + "- Or save model locally\n", + "```python\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\")\n", + "model.save_pretrained(peft_model_id)\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 16, @@ -1236,7 +1263,9 @@ "outputs": [], "source": [ "# saving model\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "model.save_pretrained(peft_model_id)" ] }, @@ -1260,7 +1289,9 @@ "source": [ "from peft import PeftModel, PeftConfig\n", "\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "\n", "config = PeftConfig.from_pretrained(peft_model_id)\n", "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)\n", diff --git a/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb b/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb index b0a7e26689..948244b671 100644 --- a/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb +++ b/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb @@ -1072,6 +1072,33 @@ " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" ] }, + { + "cell_type": "markdown", + "id": "c8f35152", + "metadata": {}, + "source": [ + "You can push model to hub or save model locally. 
\n", + "\n", + "- Option1: Pushing the model to Hugging Face Hub\n", + "```python\n", + "model.push_to_hub(\n", + " f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\"),\n", + " token = \"hf_...\"\n", + ")\n", + "```\n", + "token (`bool` or `str`, *optional*):\n", + " `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated\n", + " when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n", + " is not specified.\n", + " Or you can get your token from https://huggingface.co/settings/token\n", + "```\n", + "- Or save model locally\n", + "```python\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\")\n", + "model.save_pretrained(peft_model_id)\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 12, @@ -1080,7 +1107,9 @@ "outputs": [], "source": [ "# saving model\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "model.save_pretrained(peft_model_id)" ] }, @@ -1116,7 +1145,9 @@ "source": [ "from peft import PeftModel, PeftConfig\n", "\n", - "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", "\n", "config = PeftConfig.from_pretrained(peft_model_id)\n", "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)\n", diff --git a/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py b/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py index 36df125cc5..dc202580b0 100644 --- a/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py +++ b/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py @@ -298,12 +298,22 @@ def collate_fn(examples): pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False) accelerator.wait_for_everyone() - model.push_to_hub( - "smangrul/" - + f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), - state_dict=accelerator.get_state_dict(model), - use_auth_token=True, + # Option1: Pushing the model to Hugging Face Hub + # model.push_to_hub( + # f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), + # token = "hf_..." + # ) + # token (`bool` or `str`, *optional*): + # `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated + # when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + # is not specified. 
+ # Or you can get your token from https://huggingface.co/settings/token + + # Option2: Saving the model locally + peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace( + "/", "_" ) + model.save_pretrained(peft_model_id) accelerator.wait_for_everyone() diff --git a/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py b/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py index c2146a5dce..9c60fd8057 100644 --- a/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py +++ b/examples/conditional_generation/peft_lora_seq2seq_accelerate_fsdp.py @@ -125,11 +125,19 @@ def preprocess_function(examples): accelerator.print(f"{eval_preds[:10]=}") accelerator.print(f"{dataset['validation'][label_column][:10]=}") accelerator.wait_for_everyone() - model.push_to_hub( - "smangrul/" + f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), - state_dict=accelerator.get_state_dict(model), - use_auth_token=True, - ) + # Option1: Pushing the model to Hugging Face Hub + # model.push_to_hub( + # f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"), + # token = "hf_..." + # ) + # token (`bool` or `str`, *optional*): + # `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated + # when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + # is not specified. + # Or you can get your token from https://huggingface.co/settings/token + # Option2: Saving the model locally + peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_") + model.save_pretrained(peft_model_id) accelerator.wait_for_everyone() From f0fb9516d8ebc4fed1bacaa11b789be023f003f8 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Wed, 29 Nov 2023 12:37:39 +0100 Subject: [PATCH 2/6] ENH: Different initialization methods for LoRA (#1189) This PR adds the possibility to use different initialization methods for LoRA, as is a requirement for a completely backwards compatible adoption of PEFT in diffusers. The default is still the same as always, namely the one from the reference implementation by Microsoft. On top of that, it is now possible to pass `init_lora_weights='gaussian'` to initialize the LoRA weights in the same way as is default for diffusers, namely with a normal distribution which is scaled by 1/r. The init method currently applies to LoRA linear and conv layers, but not embedding layers, which are always initialized from a normal distribution (and are probably irrelevant for diffusers). In the future, similar extensions could be added for other adapter methods. 
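A minimal usage sketch (the base model id and the target module names below are
illustrative placeholders, not part of this patch):

```python
from transformers import AutoModelForCausalLM

from peft import LoraConfig, get_peft_model

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# init_lora_weights="gaussian" draws lora_A from a normal distribution with
# std 1/r (the diffusers-style init); lora_B is still zero-initialized, so the
# adapter starts out as a no-op either way.
config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    init_lora_weights="gaussian",
)
peft_model = get_peft_model(base_model, config)
```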
--- setup.py | 4 +- src/peft/tuners/lora/config.py | 12 +- src/peft/tuners/lora/layer.py | 22 +++- tests/test_initialization.py | 232 +++++++++++++++++++++++++++++++++ 4 files changed, 258 insertions(+), 12 deletions(-) create mode 100644 tests/test_initialization.py diff --git a/setup.py b/setup.py index 8e3d60ec7c..7f5e55524f 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,9 @@ extras["quality"] = ["black ~= 22.0", "ruff>=0.0.241", "urllib3<=2.0.0"] extras["docs_specific"] = ["hf-doc-builder"] extras["dev"] = extras["quality"] + extras["docs_specific"] -extras["test"] = extras["dev"] + ["pytest", "pytest-cov", "pytest-xdist", "parameterized", "datasets", "diffusers<0.21.0"] +extras["test"] = extras["dev"] + [ + "pytest", "pytest-cov", "pytest-xdist", "parameterized", "datasets", "diffusers<0.21.0", "scipy" +] setup( name="peft", diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 2412b61a1a..b1e31d8198 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -13,8 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + from dataclasses import dataclass, field -from typing import List, Optional, Union +from typing import List, Literal, Optional, Union from peft.config import PeftConfig from peft.utils import PeftType @@ -76,12 +78,14 @@ class LoraConfig(PeftConfig): "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." }, ) - init_lora_weights: bool = field( + init_lora_weights: bool | Literal["gaussian"] = field( default=True, metadata={ "help": ( - "Whether to initialize the weights of the Lora layers with their default initialization. Don't change " - "this setting, except if you know exactly what you're doing." + "How to initialize the weights of the LoRA layers. Passing True (default) results in the default " + "initialization from the reference implementation from Microsoft. Passing 'gaussian' results " + "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " + "to False leads to completely random initialization and is discouraged." 
), }, ) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index c263053183..5ea726d2ff 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -84,7 +84,7 @@ def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weig self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False) self.scaling[adapter_name] = lora_alpha / r if init_lora_weights: - self.reset_lora_parameters(adapter_name) + self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(self.get_base_layer(), "weight", None) if weight is not None: @@ -116,7 +116,7 @@ def update_layer_conv2d(self, adapter_name, r, lora_alpha, lora_dropout, init_lo self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) self.scaling[adapter_name] = lora_alpha / r if init_lora_weights: - self.reset_lora_parameters(adapter_name) + self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(base_layer, "weight", None) if weight is not None: @@ -142,8 +142,7 @@ def update_layer_embedding(self, adapter_name, r, lora_alpha, lora_dropout, init self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) self.scaling[adapter_name] = lora_alpha / r - if init_lora_weights: - self.reset_lora_parameters(adapter_name) + self.reset_lora_parameters(adapter_name, init_lora_weights) base_layer = self.get_base_layer() weight = getattr(base_layer, "weight", None) @@ -152,10 +151,19 @@ def update_layer_embedding(self, adapter_name, r, lora_alpha, lora_dropout, init self.to(base_layer.weight.device, dtype=weight.dtype) self.set_adapter(self.active_adapters) - def reset_lora_parameters(self, adapter_name): + def reset_lora_parameters(self, adapter_name, init_lora_weights): + if init_lora_weights is False: + return + if adapter_name in self.lora_A.keys(): - # initialize A the same way as the default for nn.Linear and B to zero - nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + if init_lora_weights is True: + # initialize A the same way as the default for nn.Linear and B to zero + # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 + nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + elif init_lora_weights.lower() == "gaussian": + nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) + else: + raise ValueError(f"Unknown initialization {init_lora_weights=}") nn.init.zeros_(self.lora_B[adapter_name].weight) if adapter_name in self.lora_embedding_A.keys(): # initialize a the same way as the default for nn.linear and b to zero diff --git a/tests/test_initialization.py b/tests/test_initialization.py new file mode 100644 index 0000000000..3770b4a74f --- /dev/null +++ b/tests/test_initialization.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import torch +from scipy import stats +from torch import nn + +from peft import LoraConfig, get_peft_model +from peft.utils import infer_device + + +class InitializationTest(unittest.TestCase): + """Test class to check the initialization of adapters.""" + + torch_device = infer_device() + + def get_uniform(self, amin, amax, size=(10000,)): + unif = torch.distributions.uniform.Uniform(amin, amax) + samples = unif.sample(size) + return samples + + def get_normal(self, mean, std, size=(10000,)): + normal = torch.distributions.normal.Normal(mean, std) + samples = normal.sample(size) + return samples + + def get_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + # choose a large weight so that averages are close to expected values + self.linear = nn.Linear(1000, 1000) + self.embed = nn.Embedding(1000, 1000) + self.conv2d = nn.Conv2d(100, 100, 3) + + def forward(self, x): + return self.linear(x) + + return MyModule().eval().to(self.torch_device) + + def test_lora_linear_init_default(self): + # default is True + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["linear"]) + model = get_peft_model(model, config) + weight_A = model.linear.lora_A["default"].weight + weight_B = model.linear.lora_B["default"].weight + + # use statistical test to check if weight A is from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a normal distribution + normal = self.get_normal(weight_A.mean().item(), weight_A.std().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_linear_init_gaussian(self): + # use gaussian init + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["linear"], init_lora_weights="gaussian") + model = get_peft_model(model, config) + weight_A = model.linear.lora_A["default"].weight + weight_B = model.linear.lora_B["default"].weight + + # use statistical test to check if weight A is from a normal distribution + normal = self.get_normal(0.0, 1 / config.r) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + + # import matplotlib.pyplot as plt + # x = weight_A.detach().flatten().cpu().numpy() + # breakpoint() + + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_linear_false(self): + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["linear"], init_lora_weights=False) + model = get_peft_model(model, config) + weight_B = model.linear.lora_B["default"].weight + + # with init_lora_weights=False, weight B should *not* be zero. We don't care so much about the actual values + # as long as they are not zero, in order to avoid identity transformation. 
+ self.assertFalse(torch.allclose(weight_B, torch.zeros_like(weight_B))) + + def test_lora_embedding_default(self): + # embedding is initialized as a normal distribution, not kaiming uniform + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["embed"]) + model = get_peft_model(model, config) + weight_A = model.embed.lora_embedding_A["default"] + weight_B = model.embed.lora_embedding_B["default"] + + # use statistical test to check if weight B is from a normal distribution + normal = self.get_normal(0.0, 1.0) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight B is *not* from a uniform distribution + unif = self.get_uniform(weight_B.min().item(), weight_B.max().item()) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight A is zero + self.assertTrue((weight_A == 0.0).all()) + + def test_lora_embedding_gaussian(self): + # embedding does not change with init_lora_weights="gaussian" vs True + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["embed"], init_lora_weights="gaussian") + model = get_peft_model(model, config) + weight_A = model.embed.lora_embedding_A["default"] + weight_B = model.embed.lora_embedding_B["default"] + + # use statistical test to check if weight B is from a normal distribution + normal = self.get_normal(0.0, 1.0) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight B is *not* from a uniform distribution + unif = self.get_uniform(weight_B.min().item(), weight_B.max().item()) + _, p_value = stats.kstest(weight_B.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight A is zero + self.assertTrue((weight_A == 0.0).all()) + + def test_lora_embedding_false(self): + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["embed"], init_lora_weights=False) + model = get_peft_model(model, config) + weight_A = model.embed.lora_embedding_B["default"] + + # with init_lora_weights=False, weight A should *not* be zero. We don't care so much about the actual values + # as long as they are not zero, in order to avoid identity transformation. 
+ self.assertFalse(torch.allclose(weight_A, torch.zeros_like(weight_A))) + + def test_lora_conv2d_default(self): + # default is True + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["conv2d"]) + model = get_peft_model(model, config) + weight_A = model.conv2d.lora_A["default"].weight + weight_B = model.conv2d.lora_B["default"].weight + + # use statistical test to check if weight A is from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a normal distribution + normal = self.get_normal(weight_A.mean().item(), weight_A.std().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_conv2d_init_gaussian(self): + # use gaussian init + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["conv2d"], init_lora_weights="gaussian") + model = get_peft_model(model, config) + weight_A = model.conv2d.lora_A["default"].weight + weight_B = model.conv2d.lora_B["default"].weight + + # use statistical test to check if weight A is from a normal distribution + normal = self.get_normal(0.0, 1 / config.r) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), normal.flatten().cpu().numpy()) + self.assertGreater(p_value, 0.5) + + # check that weight A is *not* from a uniform distribution + unif = self.get_uniform(weight_A.min().item(), weight_A.max().item()) + _, p_value = stats.kstest(weight_A.detach().flatten().cpu().numpy(), unif.flatten().cpu().numpy()) + self.assertLess(p_value, 0.05) + + # check that weight B is zero + self.assertTrue((weight_B == 0.0).all()) + + def test_lora_conv2d_false(self): + torch.manual_seed(0) + + model = self.get_model() + config = LoraConfig(target_modules=["conv2d"], init_lora_weights=False) + model = get_peft_model(model, config) + weight_B = model.conv2d.lora_B["default"].weight + + # with init_lora_weights=False, weight B should *not* be zero. We don't care so much about the actual values + # as long as they are not zero, in order to avoid identity transformation. + self.assertFalse(torch.allclose(weight_B, torch.zeros_like(weight_B))) From 8298f1a3668604ac9bc3f6e28b24e8eb554891a1 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:28:41 +0530 Subject: [PATCH 3/6] Training PEFT models with new tokens being added to the embedding layers and tokenizer (#1147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add support for saving base layers weights along with adapter weights * Update save_and_load.py * Add an example showing the usage of the added feature * refactor the functionality * fix * refactoring code 1. Add `is_embedding_layer_resized` parameter to `save_pretrained` 2. Fix the deduplication in README when adding PEFT details. 3. `save_pretrained` should only save the model when `is_main_process=True` which is one of the parameters of `save_pretrained`. 
* update example * fix the model card * fix model card * 😅 * fix model card * automate setting `is_embedding_layer_resized` * nits * Update peft_lora_clm_with_additional_tokens.ipynb * add test * fix tests * maybe fixes the issue? * address comments Co-Authored-By: Benjamin Bossan * Apply suggestions from code review Co-authored-by: Benjamin Bossan --------- Co-authored-by: Benjamin Bossan --- ...peft_lora_clm_with_additional_tokens.ipynb | 1012 +++++++++++++++++ src/peft/peft_model.py | 52 +- src/peft/utils/other.py | 1 + src/peft/utils/save_and_load.py | 45 +- tests/test_custom_models.py | 76 ++ 5 files changed, 1167 insertions(+), 19 deletions(-) create mode 100644 examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb diff --git a/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb b/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb new file mode 100644 index 0000000000..81762de08c --- /dev/null +++ b/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb @@ -0,0 +1,1012 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5f239612-620e-4430-8685-9fdc6b179b41", + "metadata": {}, + "source": [ + "# Training PEFT models with new tokens being added to the embedding layers and tokenizer\n", + "\n", + "In this example, we will learn how to train a LoRA model when adding new tokens to the tokenizer and model. \n", + "This is a common usecase when doing the following:\n", + "1. Instruction finetuning with new tokens beind added such as `<|user|>`, `<|assistant|>`, `<|system|>`, ``, `` to properly format the conversations\n", + "2. Finetuning on a specific language wherein language spoecific tokens are added, e.g., korean tokens being added to vocabulary for finetuning LLM on Korean datasets.\n", + "3. Instruction finetuning to return outputs in certain format to enable agent behaviour new tokens such as `<|FUNCTIONS|>`, `<|BROWSE|>`, `<|TEXT2IMAGE|>`, `<|ASR|>`, `<|TTS|>`, `<|GENERATECODE|>`, `<|RAG|>`.\n", + "\n", + "In such cases, you add the Embedding modules to the LORA `target_modules`. PEFT will take care of saving the embedding layers with the new added tokens along with the adapter weights that were trained on the specific initialization of the embeddings weights of the added tokens." 
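For example, a LoRA config along these lines is enough for PEFT to treat the embeddings specially (the module names assume a Llama/Mistral-style architecture and are illustrative):

```python
from peft import LoraConfig

# Because "embed_tokens" and "lm_head" appear in `target_modules`,
# `save_pretrained` (with its default `save_embedding_layers="auto"`) will also
# store the resized embedding weights alongside the LoRA adapter weights.
config = LoraConfig(
    r=64,
    lora_alpha=128,
    lora_dropout=0.0,
    target_modules=["embed_tokens", "lm_head", "q_proj", "v_proj"],
)
```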
+ ] + }, + { + "cell_type": "markdown", + "id": "b27c55e8-edaa-4059-90bc-d6096d596902", + "metadata": {}, + "source": [ + "Let's import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6f864c90", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n", + "os.environ[\"WANDB_PROJECT\"] = \"PeftExamples\"\n", + "import transformers\n", + "from peft import (\n", + " LoraConfig,\n", + " PeftConfig,\n", + " PeftModel,\n", + " get_peft_model,\n", + " prepare_model_for_int8_training,\n", + ")\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " HfArgumentParser,\n", + " TrainingArguments,\n", + " Trainer,\n", + " default_data_collator,\n", + ")\n", + "import torch\n", + "from dataclasses import dataclass, field\n", + "from typing import Optional\n", + "from dataclass_csv import DataclassReader\n", + "from torch.utils.data import Dataset, DataLoader\n", + "\n", + "from enum import Enum" + ] + }, + { + "cell_type": "markdown", + "id": "74950a3f-bb63-4ce5-9e2b-1b83f92b13a2", + "metadata": {}, + "source": [ + "## Prepare Model and Tokenizer" + ] + }, + { + "cell_type": "markdown", + "id": "76763f5e-64b2-409b-8845-ae5589f8a4e0", + "metadata": {}, + "source": [ + "Now, we will be adding 27 new tokens as well as replace the existing pad, bos and eos tokens of the model." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fd0498ea-547e-418d-bf13-c9abafdd5476", + "metadata": {}, + "outputs": [], + "source": [ + "class SpecialTokens(str, Enum):\n", + " begin_target = \"<|begintarget|>\"\n", + " end_target = \"<|endtarget|>\"\n", + " begin_context = \"<|begincontext|>\"\n", + " end_context = \"<|endcontext|>\"\n", + " system = \"<|system|>\"\n", + " user = \"<|user|>\"\n", + " begin_last_user_utterance = \"<|beginlastuserutterance|>\"\n", + " end_last_user_utterance = \"<|endlastuserutterance|>\"\n", + " begin_dsts = \"<|begindsts|>\"\n", + " end_dsts = \"<|enddsts|>\"\n", + " begin_dst = \"<|begindst|>\"\n", + " end_dst = \"<|enddst|>\"\n", + " begin_belief = \"<|beginbelief|>\"\n", + " end_belief = \"<|endbelief|>\"\n", + " begin_response = \"<|beginresponse|>\"\n", + " end_response = \"<|endresponse|>\"\n", + " begin_action = \"<|beginaction|>\"\n", + " end_action = \"<|endaction|>\"\n", + " begin_user_action = \"<|beginuseraction|>\"\n", + " end_user_action = \"<|enduseraction|>\"\n", + " sys_actions = \"<|sysactions|>\"\n", + " begin_intent = \"<|beginintent|>\"\n", + " end_intent = \"<|endintent|>\"\n", + " begin_requested_slots = \"<|beginrequestedslots|>\"\n", + " end_requested_slots = \"<|endrequestedslots|>\"\n", + " pad_token = \"<|pad|>\"\n", + " bos_token = \"<|startoftext|>\"\n", + "\n", + " @classmethod\n", + " def list(cls):\n", + " return [c.value for c in cls]" + ] + }, + { + "cell_type": "markdown", + "id": "ae4a4255-5f13-4eef-a024-4f1de0f2173b", + "metadata": {}, + "source": [ + "We will be finetuning Mistral-7B model. Let's load the tokenizer and add the special tokens followed by loading the base model and resizzing the embedding layers to accomodate the newly added tokens." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f0eedef9", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91c67b6377fc4dd7977bf544de784d51", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|><|begincontext|><|user|> Can you find me place to eat?<|system|> What kind of food would you like to have and where would you like me to search in?<|user|> Food kind of California will be perfect in SF.<|system|> There are 10 restaurants, Al's Place is one of the good restaurant in San Francisco.<|user|> Can you look for any other restaurant?<|system|> Alta Msp is one of the good restaurant in San Francisco.<|beginlastuserutterance|> Can you find me the address?<|endlastuserutterance|><|endcontext|><|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurants<|endintent|><|beginrequestedslots|> Restaurants^street_address<|endrequestedslots|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->California<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^street_address~1275 Minnesota Street<|endaction|><|beginresponse|> The street address of 
the restaurant is 1275 Minnesota Street.<|endresponse|><|endtarget|><|endtarget|>\"" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.decode(train_dataset[0][\"input_ids\"])" + ] + }, + { + "cell_type": "markdown", + "id": "239d1c83-196d-471e-9bf7-5f36dafa9894", + "metadata": {}, + "source": [ + "# Train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ec80d6ee", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n", + "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33msmangrul\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /raid/sourab/temp/wandb/run-20231128_230934-edod21gq" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run ethereal-eon-1 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/smangrul/PeftExamples" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/smangrul/PeftExamples/runs/edod21gq" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [246/246 05:51, Epoch 2/2]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Step | Training Loss
  10 | 5.189800
  20 | 3.745500
  30 | 2.371500
  40 | 1.630200
  50 | 1.302600
  60 | 0.999400
  70 | 0.704100
  80 | 0.527800
  90 | 0.509700
 100 | 0.382300
 110 | 0.318200
 120 | 0.323500
 130 | 0.263400
 140 | 0.290900
 150 | 0.277400
 160 | 0.232800
 170 | 0.223600
 180 | 0.229600
 190 | 0.233100
 200 | 0.210200
 210 | 0.245800
 220 | 0.197300
 230 | 0.210100
 240 | 0.209800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=246, training_loss=0.8516577879587809, metrics={'train_runtime': 354.9013, 'train_samples_per_second': 5.556, 'train_steps_per_second': 0.693, 'total_flos': 4.318233532091597e+16, 'train_loss': 0.8516577879587809, 'epoch': 2.0})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_args = TrainingArguments(\n", + " output_dir=\"mistral_lora_clm_with_added_tokens\",\n", + " num_train_epochs=2,\n", + " save_total_limit=5,\n", + " per_device_train_batch_size=8,\n", + " warmup_steps=10,\n", + " weight_decay=0.0001,\n", + " dataloader_drop_last=True,\n", + " bf16=True,\n", + " logging_steps=10,\n", + " learning_rate=1e-5,\n", + " gradient_checkpointing=True,\n", + " gradient_checkpointing_kwargs={\"use_reentrant\": False},\n", + " remove_unused_columns=False,\n", + " hub_model_id=\"smangrul/mistral_lora_clm_with_added_tokens\",\n", + " push_to_hub=True,\n", + " hub_private_repo=True,\n", + ")\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " data_collator=default_data_collator,\n", + ")\n", + "# model.config.use_cache = False\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "7bc1cbed-4eb9-4aaa-ab5f-5b91bf432307", + "metadata": {}, + "source": [ + "# Check the model output on a sample from evaluation dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "71851793", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "context=\"<|begincontext|><|user|>Can you find me a place to eat please?<|system|>Where at? And what kind of cuisine are you craving?<|user|>Somewhere in SF, and I am really craving Thai food at the moment!<|system|>I found a bunch of restaurants, there's actually 10 that you might like in San Francisco, one of them being Baan Thai House & Wine Bar<|user|>How can I reach them? 
And what's their address?<|system|>You can reach them by phone at 415-379-4505 and visit them at 534 Irving Street<|beginlastuserutterance|>Great, that restaurant sounds good<|endlastuserutterance|><|endcontext|>\" \n", + "\n", + " target_predicted='<|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurants<|endintent|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^phone_number~|REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^phone_number~415-379-4505|INFORM->Restaurants^street_address~534 Irving Street<|endaction|><|beginresponse|> Great, the phone number is 415-379-4505 and the address is 534 Irving Street<|endresponse|><|endtarget|>' \n", + "\n", + " target='<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|>Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|>SELECT->Restaurants^~<|enduseraction|><|beginaction|>OFFER_INTENT->Restaurants^intent~ReserveRestaurant<|endaction|><|beginresponse|>Want me to book a table?<|endresponse|><|endtarget|>'\n" + ] + } + ], + "source": [ + "import random\n", + "\n", + "i = random.randint(0, len(dataset[\"test\"]))\n", + "context = dataset[\"test\"][i][\"context\"]\n", + "\n", + "batch = tokenizer(context, return_tensors=\"pt\")\n", + "batch = {k: v.to(\"cuda\") for k, v in batch.items()}\n", + "model.eval()\n", + "output_tokens = model.generate(\n", + " **batch,\n", + " max_new_tokens=256,\n", + " do_sample=True,\n", + " temperature=0.2,\n", + " top_p=0.95,\n", + " top_k=50,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " pad_token_id=tokenizer.pad_token_id,\n", + ")\n", + "target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split(\"<|endcontext|>\")[1]\n", + "target = dataset[\"test\"][i][\"target\"]\n", + "print(f\"{context=} \\n\\n {target_predicted=} \\n\\n {target=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f940a660-2f7c-4a3a-b412-3f037aedb890", + "metadata": {}, + "source": [ + "# Save the Adapter model " + ] + }, + { + "cell_type": "markdown", + "id": "7ebe05e9-9b93-42f6-bba8-46b8cc3d100f", + "metadata": {}, + "source": [ + "When the lora layers are applied to embedding layers, the corresponding base model embedding layers are also saved. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3d7459ba-caa8-4f10-aa70-89be4541cbdf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/raid/sourab/peft/src/peft/utils/save_and_load.py:128: UserWarning: Setting `is_embedding_layer_resized` to `True` as embedding layers found in `target_modules`\n", + " warnings.warn(\"Setting `is_embedding_layer_resized` to `True` as embedding layers found in `target_modules`\")\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8d23186832014f209939ab83e79da011", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 3 LFS files: 0%| | 0/3 [00:00<|user|>Can you find me a place to eat please?<|system|>Where at? 
And what kind of cuisine are you craving?<|user|>Somewhere in SF, and I am really craving Thai food at the moment!<|system|>I found a bunch of restaurants, there's actually 10 that you might like in San Francisco, one of them being Baan Thai House & Wine Bar<|user|>How can I reach them? And what's their address?<|system|>You can reach them by phone at 415-379-4505 and visit them at 534 Irving Street<|beginlastuserutterance|>Great, that restaurant sounds good<|endlastuserutterance|><|endcontext|>\" \n", + "\n", + " target_predicted='<|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurant<|endintent|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^phone_number~|REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^phone_number~415-379-4505|INFORM->Restaurants^street_address~534 Irving Street<|endaction|><|beginresponse|> The phone number is 415-379-4505 and the address is 534 Irving Street<|endresponse|><|endtarget|>' \n", + "\n", + " target='<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|>Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|>SELECT->Restaurants^~<|enduseraction|><|beginaction|>OFFER_INTENT->Restaurants^intent~ReserveRestaurant<|endaction|><|beginresponse|>Want me to book a table?<|endresponse|><|endtarget|>'\n" + ] + } + ], + "source": [ + "from peft import PeftModel\n", + "\n", + "inference_model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " low_cpu_mem_usage=True,\n", + " # use_flash_attention_2=True,\n", + ")\n", + "inference_model.resize_token_embeddings(len(tokenizer))\n", + "\n", + "inference_model = PeftModel.from_pretrained(inference_model, \"smangrul/mistral_lora_clm_with_added_tokens\")\n", + "inference_model.to(\"cuda\")\n", + "inference_model.eval()\n", + "\n", + "output_tokens = inference_model.generate(\n", + " **batch,\n", + " max_new_tokens=256,\n", + " do_sample=True,\n", + " temperature=0.2,\n", + " top_p=0.95,\n", + " top_k=50,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " pad_token_id=tokenizer.pad_token_id,\n", + ")\n", + "\n", + "target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split(\"<|endcontext|>\")[1]\n", + "print(f\"{context=} \\n\\n {target_predicted=} \\n\\n {target=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd57f6e8-761f-4e0b-941c-f6973e13b186", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index c5c7825baa..24ef48c22e 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -159,6 +159,8 @@ def save_pretrained( save_directory: str, safe_serialization: bool = True, selected_adapters: Optional[List[str]] = None, + save_embedding_layers: Union[str, bool] = "auto", + 
is_main_process: bool = True, **kwargs: Any, ): r""" @@ -172,6 +174,14 @@ def save_pretrained( exist). safe_serialization (`bool`, *optional*): Whether to save the adapter files in safetensors format. + selected_adapters (`list(str)`, *optional*): + A list of adapters to be saved. If `None`, will default to all adapters. + save_embedding_layers (`Union[bool, str]`, , *optional*, defaults to `auto`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common + embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. + Based on it sets the boolean flag. This only works for 🤗 transformers models. + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. Will default to `True`. kwargs (additional keyword arguments, *optional*): Additional keyword arguments passed along to the `push_to_hub` method. """ @@ -190,19 +200,23 @@ def save_pretrained( f" {list(self.peft_config.keys())} - got {selected_adapters}." ) - os.makedirs(save_directory, exist_ok=True) - self.create_or_update_model_card(save_directory) + if is_main_process: + os.makedirs(save_directory, exist_ok=True) + self.create_or_update_model_card(save_directory) for adapter_name in selected_adapters: peft_config = self.peft_config[adapter_name] # save only the trainable weights output_state_dict = get_peft_model_state_dict( - self, state_dict=kwargs.get("state_dict", None), adapter_name=adapter_name + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + save_embedding_layers=save_embedding_layers, ) output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory os.makedirs(output_dir, exist_ok=True) - if safe_serialization: + if is_main_process and safe_serialization: # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 # Safetensors does not allow tensor aliasing. 
# We're going to remove aliases before saving @@ -230,7 +244,7 @@ def save_pretrained( os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), metadata={"format": "pt"}, ) - else: + elif is_main_process: torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) # save the config and change the inference mode to `True` @@ -257,7 +271,8 @@ def save_pretrained( else: auto_mapping_dict = None - peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) + if is_main_process: + peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) peft_config.inference_mode = inference_mode @classmethod @@ -721,24 +736,27 @@ def create_or_update_model_card(self, output_dir: str): if hasattr(self.config, "quantization_config"): quantization_config = self.config.quantization_config.to_dict() training_config_text = "" + quantization_prefix = "The following `bitsandbytes` quantization config was used during training:" # Adds quantization information if it was used if quantization_config is not None: - training_config_text += "\nThe following `bitsandbytes` quantization config was used during training:\n" + training_config_text += f"\n{quantization_prefix}\n" training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()]) training_config_text += "\n" - training_procedure_heading = "## Training procedure\n" - if training_procedure_heading in lines: - lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) - else: - lines.append(f"{training_procedure_heading}\n{training_config_text}") + training_procedure_heading = "## Training procedure" + if quantization_prefix not in lines and bool(training_config_text): + if training_procedure_heading in lines: + lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) + else: + lines.append(f"{training_procedure_heading}\n{training_config_text}") # Adds peft version - framework_block_heading = "### Framework versions\n" - if framework_block_heading in lines: - lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}\n") - else: - lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}\n") + framework_block_heading = "### Framework versions" + if f"- PEFT {__version__}" not in lines: + if framework_block_heading in lines: + lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}") + else: + lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}") card.text = "\n".join(lines) card.save(filename) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index e811bee5ba..1c34701739 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -583,3 +583,4 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: WEIGHTS_NAME = "adapter_model.bin" SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" CONFIG_NAME = "adapter_config.json" +EMBEDDING_LAYER_NAMES = ["embed_tokens", "lm_head"] diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 07e653bef1..97bde0d6fe 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +import warnings from typing import Optional import torch @@ -20,11 +21,26 @@ from huggingface_hub.utils import EntryNotFoundError from safetensors.torch import load_file as safe_load_file -from .other import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, infer_device +from .other import EMBEDDING_LAYER_NAMES, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, infer_device from .peft_types import PeftType -def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", unwrap_compiled=False): +def has_valid_embedding_base_layer(layer): + """Check if the layer has an embedding base layer""" + return hasattr(layer, "base_layer") and isinstance(layer.base_layer, (torch.nn.Linear, torch.nn.Embedding)) + + +def get_embedding_layer_name(model, layer, is_prompt_learning): + """Get the name of the embedding module for a given layer.""" + for name, module in model.named_modules(): + if (is_prompt_learning and module == layer) or module == layer.base_layer: + return name + return None + + +def get_peft_model_state_dict( + model, state_dict=None, adapter_name="default", unwrap_compiled=False, save_embedding_layers="auto" +): """ Get the state dict of the Peft model. @@ -37,6 +53,10 @@ def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", un The name of the adapter whose state dict should be returned. unwrap_compiled (`bool`, *optional*, defaults to `False`): Whether to unwrap the model if torch.compile was used. + save_embedding_layers (`Union[bool, str]`, , *optional*, defaults to `auto`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common embedding + layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. Based on it + sets the boolean flag. This only works for 🤗 transformers models. 
""" if unwrap_compiled: model = getattr(model, "_orig_mod", model) @@ -100,6 +120,27 @@ def get_peft_model_state_dict(model, state_dict=None, adapter_name="default", un if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in model.modules_to_save): to_return[key.replace("modules_to_save.", "")] = value + # check the common embedding layers in `target_modules` to reset `save_embedding_layers` if necessary + if ( + save_embedding_layers == "auto" + and hasattr(config, "target_modules") + and any(k in config.target_modules for k in EMBEDDING_LAYER_NAMES) + ): + warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.") + save_embedding_layers = True + elif save_embedding_layers == "auto": + save_embedding_layers = False + + if save_embedding_layers and hasattr(model, "get_input_embeddings"): + for layer in [model.get_input_embeddings(), model.get_output_embeddings()]: + if config.is_prompt_learning or has_valid_embedding_base_layer(layer): + # support from version >= 0.6.2 + embedding_module_name = get_embedding_layer_name(model, layer, config.is_prompt_learning) + if embedding_module_name: + to_return.update({k: v for k, v in state_dict.items() if embedding_module_name in k}) + elif save_embedding_layers: + warnings.warn("Could not identify embedding layer(s) because the model is not a 🤗 transformers model.") + to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()} return to_return diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 347df218b2..b298388a84 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -333,6 +333,33 @@ def forward(self, X): return X +class ModelEmbWithEmbeddingUtils(nn.Module): + # Adds `get_input_embeddings` and `get_output_embeddings` methods to mimic 🤗 transformers models + def __init__(self): + super().__init__() + self.embed_tokens = nn.Embedding(100, 5) + self.conv1d = Conv1D(1, 5) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lin0 = nn.Linear(10, 2) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.embed_tokens(X) + X = self.conv1d(X) + X = self.relu(X) + X = self.flat(X) + X = self.lin0(X) + X = self.sm(X) + return X + + def get_input_embeddings(self): + return self.embed_tokens + + def get_output_embeddings(self): + return None + + class ModelConv2D(nn.Module): def __init__(self): super().__init__() @@ -750,6 +777,55 @@ def test_non_existing_model_card(self): # rough check that the model card is pre-filled self.assertGreater(len(model_card), 1000) + @parameterized.expand(["auto", True, False]) + def test_targeting_lora_to_embedding_layer(self, save_embedding_layers): + model = ModelEmbWithEmbeddingUtils() + config = LoraConfig(target_modules=["embed_tokens", "lin0"], init_lora_weights=False) + model = get_peft_model(model, config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + if save_embedding_layers == "auto": + # assert warning + msg_start = "Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`." 
+ with self.assertWarns(UserWarning, msg=msg_start): + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + else: + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + from safetensors.torch import load_file as safe_load_file + + state_dict = safe_load_file(os.path.join(tmp_dirname, "adapter_model.safetensors")) + if save_embedding_layers in ["auto", True]: + self.assertTrue("base_model.model.embed_tokens.base_layer.weight" in state_dict) + self.assertTrue( + torch.allclose( + model.base_model.model.embed_tokens.base_layer.weight, + state_dict["base_model.model.embed_tokens.base_layer.weight"], + ) + ) + else: + self.assertFalse("base_model.model.embed_tokens.base_layer.weight" in state_dict) + del state_dict + + @parameterized.expand(["auto", True, False]) + def test_targeting_lora_to_embedding_layer_non_transformers(self, save_embedding_layers): + model = ModelEmbConv1D() + config = LoraConfig(target_modules=["emb", "lin0"], init_lora_weights=False) + model = get_peft_model(model, config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + if save_embedding_layers is True: + # assert warning + msg_start = "Could not identify embedding layer(s) because the model is not a 🤗 transformers model." + with self.assertWarns(UserWarning, msg=msg_start): + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + else: + model.save_pretrained(tmp_dirname, save_embedding_layers=save_embedding_layers) + from safetensors.torch import load_file as safe_load_file + + state_dict = safe_load_file(os.path.join(tmp_dirname, "adapter_model.safetensors")) + self.assertFalse("base_model.model.emb.base_layer.weight" in state_dict) + del state_dict + @parameterized.expand( [ LoraConfig(target_modules=["lin0"], init_lora_weights=False), From 2b901ee57230559aaf39867c7698f6aca3617162 Mon Sep 17 00:00:00 2001 From: yxli2123 <69247082+yxli2123@users.noreply.github.com> Date: Wed, 29 Nov 2023 11:08:17 -0500 Subject: [PATCH 4/6] Add LoftQ initialization method for LoRA (#1150) --------- Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Co-authored-by: Benjamin Bossan --- README.md | 1 + examples/loftq_finetuning/README.md | 69 ++ .../loftq_finetuning/quantize_save_load.py | 244 +++++ .../loftq_finetuning/train_gsm8k_llama.py | 866 ++++++++++++++++++ requirements.txt | 15 + src/peft/__init__.py | 1 + src/peft/tuners/__init__.py | 2 +- src/peft/tuners/lora/__init__.py | 4 +- src/peft/tuners/lora/config.py | 45 +- src/peft/tuners/lora/layer.py | 48 +- src/peft/tuners/lora/model.py | 4 + src/peft/utils/loftq_utils.py | 227 +++++ 12 files changed, 1514 insertions(+), 12 deletions(-) create mode 100644 examples/loftq_finetuning/README.md create mode 100644 examples/loftq_finetuning/quantize_save_load.py create mode 100644 examples/loftq_finetuning/train_gsm8k_llama.py create mode 100644 requirements.txt create mode 100644 src/peft/utils/loftq_utils.py diff --git a/README.md b/README.md index 445cb26539..79259f98ee 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ Supported methods: 7. MultiTask Prompt Tuning: [Multitask Prompt Tuning Enables Parameter-Efficient Transfer Learning](https://arxiv.org/abs/2303.02861) 8. LoHa: [FedPara: Low-Rank Hadamard Product for Communication-Efficient Federated Learning](https://arxiv.org/abs/2108.06098) 9. 
LoKr: [KronA: Parameter Efficient Tuning with Kronecker Adapter](https://arxiv.org/abs/2212.10650) based on [Navigating Text-To-Image Customization:From LyCORIS Fine-Tuning to Model Evaluation](https://arxiv.org/abs/2309.14859) implementation +10. LoftQ: [LoftQ: LoRA-Fine-Tuning-aware Quantization for Large Language Models](https://arxiv.org/abs/2310.08659) ## Getting started diff --git a/examples/loftq_finetuning/README.md b/examples/loftq_finetuning/README.md new file mode 100644 index 0000000000..726f544e85 --- /dev/null +++ b/examples/loftq_finetuning/README.md @@ -0,0 +1,69 @@ +# LoftQ: LoRA-fine-tuning-aware Quantization + +## Introduction + +LoftQ provides better initialization for LoRA adapters A and B, +and the Quantization of pre-trained weights W. + +## Quantization +We recommend to save the quantized backbone model as fp16/fp32 +and load it as [NormalFloat4](https://arxiv.org/abs/2305.14314). + +We provide a simple example to show how to quantize llama-2-7b model and save/load it. + +```sh +python quantize_save_load.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --token HF_TOKEN \ + --bits 4 --iter 5 --rank 16 \ + --save_dir model_zoo/loftq/ +``` + +- `HF_TOKEN` is the token used to access to [LLAMA models](https://huggingface.co/meta-llama). +- `quantize_and_save()` function will quantize the backbone and initialize LoRA adapters. +It creates 2 folders under `$save_dir`. The quantized backbone is at `Llama-2-7b-hf-4bit-16rank`, +and the LoRA adapters are at the sub-folder `Llama-2-7b-hf-4bit-16rank/loftq_init`. + +## Fine-tuning + +Here is an example to load the quantized backbone and LoRA adapters: + +```python +import os + +from transformers import AutoModelForCausalLM +from peft import PeftModel + + +base_model = AutoModelForCausalLM.from_pretrained( + os.path.join(args.save_dir, "Llama-2-7b-hf-4bit-16rank"), + load_in_4bit=True, +) +peft_model = PeftModel.from_pretrained( + base_model, + os.path.join(args.save_dir, "Llama-2-7b-hf-4bit-16rank", "loftq_init"), + is_trainable=True, +) +``` + +We also provide an example to fine-tune LoftQ on GSM8K. +We load the quantized backbone and LoRA adapters from the [LoftQ Huggingface hub](https://huggingface.co/LoftQ). + +```sh +python train_gsm8k_llama.py \ + --model_name_or_path LoftQ/Llama-2-7b-hf-4bit-64rank \ + --output_dir exp_results/gsm8k/llama-2-7b/bit4-rank64/lr3e-4 \ + --learning_rate 3e-4 \ + --seed 202 \ + --dataset_name gsm8k \ + --dataset_config main \ + --pad_to_max_length \ + --max_source_length 128 \ + --max_target_length 256 \ + --num_train_epochs 5 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --with_tracking \ + --report_to tensorboard +``` diff --git a/examples/loftq_finetuning/quantize_save_load.py b/examples/loftq_finetuning/quantize_save_load.py new file mode 100644 index 0000000000..3c47fa45cd --- /dev/null +++ b/examples/loftq_finetuning/quantize_save_load.py @@ -0,0 +1,244 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import torch +import torch.nn as nn +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoTokenizer, + BitsAndBytesConfig, +) + +from peft import LoftQConfig, LoraConfig, PeftModel, TaskType, get_peft_model + + +class Shell(nn.Module): + def __init__(self, weight, bias=None): + super().__init__() + self.weight = nn.Parameter(weight, requires_grad=False) + if bias is not None: + self.bias = nn.Parameter(bias, requires_grad=False) + + +def unwarap_model(model, sub_module_name=".base_layer"): + sub_module_name_list = [k.split(sub_module_name)[0] for k in model.state_dict().keys() if sub_module_name in k] + sub_module_name_set = set(sub_module_name_list) + for name in sub_module_name_set: + # get the parent of the submodule + name_parent = ".".join(name.split(".")[:-1]) + name_child = name.split(".")[-1] + sub_module = model.get_submodule(name_parent) + print(sub_module) + + # replace with shell + child = getattr(sub_module, name_child) + weight = getattr(child.base_layer, "weight", None) + bias = getattr(child.base_layer, "bias", None) + shell = Shell(weight, bias) + + setattr(sub_module, name_child, shell) + + print("You have unwrapped the model. Use it on your own risk.") + + +def print_model(model, name): + print("=" * 10 + name + "=" * 10) + print(model) + for name, param in model.named_parameters(): + if torch.is_tensor(param): + if param.dtype in [torch.float32, torch.float16]: + print( + name, + param.shape, + param.device, + param.dtype, + param.requires_grad, + param.mean().item(), + param.max().item(), + ) + else: + print(name, param.shape, param.device, param.dtype, param.requires_grad) + + +def arg_parse(): + parser = argparse.ArgumentParser(description="Quantize a model with LoftQ.") + parser.add_argument( + "--model_name_or_path", + type=str, + default=None, + required=True, + help="The name or path of the fp32/16 model.", + ) + parser.add_argument( + "--token", + type=str, + default=None, + help="The access token to download model from HuggingFace Hub.", + ) + parser.add_argument( + "--bits", + type=int, + default=4, + help="The quantized bits", + ) + parser.add_argument( + "--iter", + type=int, + default=1, + help="The alternating steps in LoftQ", + ) + parser.add_argument( + "--rank", + type=int, + default=16, + help="The rank of the LoRA adapter", + ) + parser.add_argument( + "--save_dir", + type=str, + default="./model_zoo/loftq/", + help="The rank of the LoRA adapter", + ) + args = parser.parse_args() + return args + + +def quantize_and_save(): + args = arg_parse() + + # Download weights and configure LoRA + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, token=args.token, trust_remote_code=True) + if any(name in args.model_name_or_path.lower() for name in ["llama", "mistral", "falcon"]): + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, token=args.token, trust_remote_code=True, device_map="auto" + ) + task_type = TaskType.CAUSAL_LM + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"] + + elif any(name in args.model_name_or_path.lower() for name in ["bart", "t5"]): + model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path, token=args.token, device_map="auto") + task_type = TaskType.SEQ_2_SEQ_LM + target_modules = ["q_proj", "k_proj", "v_proj", "fc1", "fc2", "out_proj"] + 
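+    # Encoder-only models (DeBERTa/RoBERTa/BERT) are handled below as sequence classifiers,
+    # with their attention/dense projections used as the LoRA target modules.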
+ elif any(name in args.model_name_or_path.lower() for name in ["deberta", "roberta", "bert"]): + model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, token=args.token) + model = model.cuda() + task_type = TaskType.SEQ_CLS + target_modules = ["query_proj", "key_proj", "value_proj", "dense"] # embeddings not supported by peft + else: + raise NotImplementedError("Other models not supported yet.") + + # Config of LoftQ + loftq_config = LoftQConfig(loftq_bits=args.bits, loftq_iter=args.iter) + + lora_config = LoraConfig( + task_type=task_type, + inference_mode=True, + r=args.rank, + lora_alpha=16 if task_type is TaskType.CAUSAL_LM else args.rank, + lora_dropout=0.1, + target_modules=target_modules, + init_lora_weights="loftq", + loftq_config=loftq_config, + ) + + # Obtain LoftQ model + lora_model = get_peft_model(model, lora_config) + base_model = lora_model.get_base_model() + + # Save LoftQ model + model_name = args.model_name_or_path.split("/")[-1] + f"-{args.bits}bit" + f"-{args.rank}rank" + base_model_dir = os.path.join(args.save_dir, model_name) + lora_model_dir = os.path.join(args.save_dir, model_name, "loft_init") + + # save lora adapters first + lora_model.base_model.peft_config[ + "default" + ].base_model_name_or_path = base_model_dir # This can be a local path or Hub model id + lora_model.base_model.peft_config["default"].init_lora_weights = True # Don't apply LoftQ when loading again + + lora_model.save_pretrained(lora_model_dir) + print_model(lora_model, "lora_model") + + # remove lora adapters and save the backbone + unwarap_model(base_model) + base_model.save_pretrained(base_model_dir) + tokenizer.save_pretrained(base_model_dir) + + print_model(base_model, "base_model") + + return base_model_dir, lora_model_dir + + +def load_loftq(base_model_path, lora_adapter_path): + if any(name in base_model_path.lower() for name in ["llama", "mistral", "falcon"]): + model = AutoModelForCausalLM.from_pretrained( + base_model_path, + device_map="auto", + low_cpu_mem_usage=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + ), + ) + elif any(name in base_model_path.lower() for name in ["bart", "t5"]): + model = AutoModelForSeq2SeqLM.from_pretrained( + base_model_path, + device_map="auto", + low_cpu_mem_usage=True, + load_in_4bit=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + ), + ) + elif any(name in base_model_path.lower() for name in ["deberta", "roberta", "bert"]): + model = AutoModelForSequenceClassification.from_pretrained( + base_model_path, + low_cpu_mem_usage=True, + load_in_4bit=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + ), + ) + else: + raise NotImplementedError("Other models not supported yet.") + + lora_model = PeftModel.from_pretrained(model, lora_adapter_path, is_trainable=True) + + # Do training or inference below + print_model(lora_model, "lora_model") + print_model(model, "base_model") + + +if __name__ == "__main__": + base_dir, lora_dir = quantize_and_save() + load_loftq(base_dir, lora_dir) + +# example command: +# python quantize_save_load.py \ +# --model_name_or_path meta-llama/Llama-2-7b-hf \ +# --token XXX \ +# --bits 4 --iter 5 --rank 16 \ +# --save_dir ./model_zoo/loftq/ diff --git a/examples/loftq_finetuning/train_gsm8k_llama.py b/examples/loftq_finetuning/train_gsm8k_llama.py new 
file mode 100644 index 0000000000..e8c3580d2e --- /dev/null +++ b/examples/loftq_finetuning/train_gsm8k_llama.py @@ -0,0 +1,866 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import copy +import logging +import math +import os +import random +import re +from pathlib import Path + +import datasets +import torch +import transformers +from accelerate import Accelerator, DistributedType +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from datasets import load_dataset +from huggingface_hub import Repository, create_repo +from torch.utils.data import DataLoader +from tqdm.auto import tqdm +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + SchedulerType, + default_data_collator, + get_scheduler, +) +from transformers.utils import send_example_telemetry +from transformers.utils.versions import require_version + +from peft import PeftModel + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.32.0.dev0") + +logger = get_logger(__name__) + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) +HF_TOKEN = "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" + + +def parse_args(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--train_file", type=str, default=None, help="A csv, txt or a json file containing the training data." + ) + parser.add_argument( + "--validation_file", type=str, default=None, help="A csv, txt or a json file containing the validation data." 
+ ) + parser.add_argument( + "--validation_split_percentage", + default=5, + help="The percentage of the train set used as validation set in case there's no validation split", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=False, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--use_slow_tokenizer", + action="store_true", + help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + parser.add_argument( + "--ignore_pad_token_for_loss", + type=bool, + default=True, + help="Whether to ignore the tokens corresponding to padded labels in the loss computation or not.", + ) + parser.add_argument( + "--max_source_length", + type=int, + default=128, + help=( + "The maximum total input sequence length after " + "tokenization.Sequences longer than this will be truncated, sequences shorter will be padded." + ), + ) + parser.add_argument( + "--max_target_length", + type=int, + default=128, + help=( + "The maximum total sequence length for target text after " + "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded." + "during ``evaluate`` and ``predict``." + ), + ) + parser.add_argument( + "--pad_to_max_length", + action="store_true", + help="If passed, pad all samples to `max_length`. 
Otherwise, dynamic padding is used.", + ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument( + "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files." + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument( + "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." + ) + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--trust_remote_code", + type=bool, + default=False, + help=( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option" + "should only be set to `True` for repositories you trust and in which you have read the code, as it will" + "execute code present on the Hub on your local machine." + ), + ) + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' + "Only applicable when `--with_tracking` is passed." + ), + ) + parser.add_argument( + "--low_cpu_mem_usage", + action="store_true", + help=( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "If passed, LLM loading time and RAM consumption will be benefited." + ), + ) + ########################## + # Generation Config # + ########################## + parser.add_argument( + "--temperature", + type=float, + default=0.8, + help="temperature of 1.0 has no effect, lower tend toward greedy sampling", + ) + parser.add_argument("--k", type=int, default=40, help="Choose k candidate words") + parser.add_argument("--p", type=float, default=0.95, help="The sum of probability of candidate words is 0.9 ") + + ########################## + # Exp Args # + ########################## + parser.add_argument( + "--adapter_name_or_path", + type=str, + default=None, + help=( + "The LoRA adapter checkpoint. Set None if you want to fine-tune from LoftQ." + "Specify a path if you want to evaluate." + ), + ) + + args = parser.parse_args() + + # Sanity checks + if args.dataset_name is None and args.train_file is None and args.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." 
+ if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + + return args + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["project_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + # Retrieve of infer repo_name + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + # Create repo and retrieve repo_id + repo_id = create_repo(repo_name, exist_ok=True, token=args.hub_token).repo_id + # Clone repo locally + repo = Repository(args.output_dir, clone_from=repo_id, token=args.hub_token) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
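+        # If the dataset ships without a "validation" split (gsm8k, for instance, only has "train"
+        # and "test"), a slice of "train" sized by --validation_split_percentage is carved out below.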
+ raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + ) + else: + data_files = {} + dataset_args = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.train_file.split(".")[-1] + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + if args.config_name: + config = AutoConfig.from_pretrained( + args.config_name, + trust_remote_code=args.trust_remote_code, + ) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained( + args.model_name_or_path, + trust_remote_code=args.trust_remote_code, + ) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, use_fast=not args.use_slow_tokenizer, trust_remote_code=args.trust_remote_code + ) + elif args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, + use_fast=not args.use_slow_tokenizer, + trust_remote_code=args.trust_remote_code, + ) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + ########################## + # Tokenizer # + ########################## + tokenizer.pad_token_id = 0 # unk. 
we want this to be different from the eos token + tokenizer.padding_side = "left" # Allow batched inference + tokenizer.truncation_side = "left" + + if args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + low_cpu_mem_usage=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=config.torch_dtype, + ), + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForCausalLM.from_config(config, trust_remote_code=args.trust_remote_code) + + ########################## + # Peft Model # + ########################## + if args.adapter_name_or_path is None: + model = PeftModel.from_pretrained(model, args.model_name_or_path, subfolder="loftq_init", is_trainable=True) + else: + model = PeftModel.from_pretrained(model, args.adapter_name_or_path, is_trainable=True) + model.print_trainable_parameters() + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. + ########################## + # GSM8K dataset # + ########################## + + # Preprocessing the datasets. + # First we tokenize all the texts. + column_names = raw_datasets["train"].column_names + + # Get the column names for source/target. + source_column, target_column = "question", "answer" + + # Temporarily set max_target_length for training. + padding = "max_length" if args.pad_to_max_length else False + task_prompt = "\nAnswer the above question. First think step by step and then answer the final number.\n" + + def prompt_process(sent_1, sent_2, prompt_1="", prompt_2="", prompt_3=""): + sent_2 = sent_2.replace("####", "The final answer is") + return prompt_1 + sent_1 + prompt_2 + sent_2 + prompt_3 + + def preprocess_function_train(examples): + sources = examples[source_column] + targets = examples[target_column] + + inputs = [prompt_process(source, target, prompt_2=task_prompt) for (source, target) in zip(sources, targets)] + + model_inputs = tokenizer( + inputs, + max_length=args.max_source_length + args.max_target_length, + padding=padding, + truncation=True, + return_tensors="pt", + ) + + labels = copy.deepcopy(model_inputs) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if padding == "max_length" and args.ignore_pad_token_for_loss: + # get the length of the target tokens. 
-1 to kick out the token + target_tokens = tokenizer(targets, padding=False) + target_len = [len(label) - 1 for label in target_tokens["input_ids"]] + + # don't calculate the loss from source and padding (left padding) + for i in range(len(labels["input_ids"])): + labels["input_ids"][i, : -target_len[i]] = -100 + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + def preprocess_function_test(examples): + sources = examples[source_column] + labels = examples[target_column] + + inputs = [source + task_prompt for source in sources] + + model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) + labels = tokenizer(labels, max_length=args.max_target_length, padding=padding, truncation=True) + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + with accelerator.main_process_first(): + train_dataset = raw_datasets["train"].map( + preprocess_function_train, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on training dataset", + ) + + eval_dataset = raw_datasets["test"].map( + preprocess_function_test, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on test dataset", + ) + + # Log a few random samples from the set: + for index in random.sample(range(len(train_dataset)), 2): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + for index in random.sample(range(len(eval_dataset)), 2): + logger.info(f"Sample {index} of the validation set: {eval_dataset[index]}.") + + # DataLoaders creation: + train_dataloader = DataLoader( + train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size + ) + eval_dataloader = DataLoader( + eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size + ) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and "lora" in n], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. 
+ if accelerator.distributed_type == DistributedType.TPU: + model.tie_weights() + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Figure out how many steps we should save the Accelerator states + checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value + accelerator.init_trackers("clm_no_trainer", experiment_config) + + # Train! + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. 
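+    # The progress bar tracks completed optimizer steps (one step per gradient_accumulation_steps
+    # batches); the resume-from-checkpoint block below converts a saved step_*/epoch_* folder name
+    # back into this count so the bar can be fast-forwarded via progress_bar.update(completed_steps).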
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + completed_steps = 0 + starting_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + checkpoint_path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + checkpoint_path = path + path = os.path.basename(checkpoint_path) + + accelerator.print(f"Resumed from checkpoint: {checkpoint_path}") + accelerator.load_state(path) + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) + + for epoch in range(starting_epoch, args.num_train_epochs): + model.train() + if args.with_tracking: + total_loss = 0 + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) + accelerator.print(f"Epoch: {epoch} | Step: {step} | Loss: {loss}") + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: + break + + model.eval() + gen_kwargs = { + "max_new_tokens": args.max_target_length, + "temperature": args.temperature, + "top_k": args.k, + "top_p": args.p, + "do_sample": True, + } + ans_pred_list = [] + ans_gold_list = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + gen_kwargs["input_ids"] = batch["input_ids"] + gen_kwargs["attention_mask"] = batch["attention_mask"] + generated_tokens = accelerator.unwrap_model(model).generate(**gen_kwargs) + + pred_tokens = generated_tokens[:, args.max_source_length :] + pred_tokens = accelerator.pad_across_processes(pred_tokens, dim=1, pad_index=tokenizer.pad_token_id) + gold_tokens = batch["labels"] + + if 
not args.pad_to_max_length: + # If we did not pad to max length, we need to pad the labels too + gold_tokens = accelerator.pad_across_processes( + batch["labels"], dim=1, pad_index=tokenizer.pad_token_id + ) + + pred_tokens, gold_tokens = accelerator.gather_for_metrics((pred_tokens, gold_tokens)) + pred_tokens, gold_tokens = pred_tokens.cpu().numpy(), gold_tokens.cpu().numpy() + + if isinstance(pred_tokens, tuple): + pred_tokens = pred_tokens[0] + decoded_pred = tokenizer.batch_decode(pred_tokens, skip_special_tokens=True) + decoded_gold = tokenizer.batch_decode(gold_tokens, skip_special_tokens=True) + + # Extract the numbers in sentences + accelerator.print(decoded_pred) + ans_pred_list += [extract_answer_number(sentence_pred) for sentence_pred in decoded_pred] + ans_gold_list += [extract_answer_number(sentence_gold) for sentence_gold in decoded_gold] + + accelerator.print(ans_pred_list) + accelerator.print(ans_gold_list) + accuracy = compute_accuracy(ans_gold_list, ans_pred_list) + + logger.info(f"epoch {epoch}: accuracy: {accuracy}") + + if args.with_tracking: + accelerator.log( + { + "accuracy": accuracy, + "train_loss": total_loss.item() / len(train_dataloader), + "epoch": epoch, + "step": completed_steps, + }, + step=completed_steps, + ) + + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + repo.push_to_hub( + commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True + ) + + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if args.with_tracking: + accelerator.end_training() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) + + +PATTERN_NUMBER = re.compile(r"-?\d+\.?\d*") + + +def extract_answer_number(sentence: str) -> float: + sentence = sentence.replace(",", "") + pred = PATTERN_NUMBER.findall(sentence) + if not pred: + return float("inf") + segment = sentence.split("The final answer is ") + if len(segment) > 1: + pred_answer = segment[1] + pred_answer = PATTERN_NUMBER.findall(pred_answer) + if len(pred_answer) > 0: + pred_answer = pred_answer[0] + else: + pred_answer = float(pred[-1]) + else: + pred_answer = float(pred[-1]) + + if isinstance(pred_answer, str): + try: + pred_answer = float(pred_answer) + except ValueError: + pred_answer = float("inf") + return pred_answer + + +def compute_accuracy(pred: list, gold: list): + acc = 0.0 + for p, g in zip(pred, gold): + if p == g: + acc += 1 + + return acc / len(pred) + + +if __name__ == "__main__": + main() + +# example command + +# python train_gsm8k_llama.py \ +# --model_name_or_path LoftQ/Llama-2-7b-hf-bit4-rank64-backbone \ +# --adapter_name_or_path LoftQ/Llama-2-7b-hf-bit4-rank64-adapters \ +# --output_dir exp_results/gsm8k/llama-2-7b/bit4-rank64/lr3e-4 \ +# 
--learning_rate 1e-4 \ +# --seed 202 \ +# --dataset_name gsm8k \ +# --dataset_config main \ +# --pad_to_max_length \ +# --max_source_length 128 \ +# --max_target_length 256 \ +# --num_train_epochs 5 \ +# --per_device_train_batch_size 4 \ +# --per_device_eval_batch_size 4 \ +# --gradient_accumulation_steps 4 \ +# --with_tracking \ +# --report_to tensorboard diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..dca857de32 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +accelerate +torch +safetensors +bitsandbytes +scipy +peft +transformers +tqdm +packaging +pytest +numpy +pyyaml +datasets +psutil +setuptools \ No newline at end of file diff --git a/src/peft/__init__.py b/src/peft/__init__.py index a3ce332f24..4d9380e697 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -48,6 +48,7 @@ AdaptionPromptConfig, AdaptionPromptModel, LoraConfig, + LoftQConfig, LoraModel, LoHaConfig, LoHaModel, diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index b357d47dc1..666e29d997 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -18,7 +18,7 @@ # limitations under the License. from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel -from .lora import LoraConfig, LoraModel +from .lora import LoraConfig, LoraModel, LoftQConfig from .loha import LoHaConfig, LoHaModel from .lokr import LoKrConfig, LoKrModel from .ia3 import IA3Config, IA3Model diff --git a/src/peft/tuners/lora/__init__.py b/src/peft/tuners/lora/__init__.py index d02bf2d948..ddc81d53cd 100644 --- a/src/peft/tuners/lora/__init__.py +++ b/src/peft/tuners/lora/__init__.py @@ -15,13 +15,13 @@ from peft.import_utils import is_bnb_4bit_available, is_bnb_available -from .config import LoraConfig +from .config import LoftQConfig, LoraConfig from .gptq import QuantLinear from .layer import Conv2d, Embedding, Linear, LoraLayer from .model import LoraModel -__all__ = ["LoraConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"] +__all__ = ["LoraConfig", "LoftQConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"] if is_bnb_available(): diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index b1e31d8198..0dcca5c1e6 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -22,6 +22,25 @@ from peft.utils import PeftType +@dataclass +class LoftQConfig: + """ + This is the sub-configuration class to store the configuration of a [`LoraModel`]. + + Args: + bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the + default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. + bits (`int`): Quantization bits for LoftQ. + iter (`int`): Alternating iterations for LoftQ. + fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear + models. weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 + bits. + """ + + loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) + loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + + @dataclass class LoraConfig(PeftConfig): """ @@ -78,7 +97,7 @@ class LoraConfig(PeftConfig): "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 
}, ) - init_lora_weights: bool | Literal["gaussian"] = field( + init_lora_weights: bool | Literal["gaussian", "loftq"] = field( default=True, metadata={ "help": ( @@ -86,6 +105,7 @@ class LoraConfig(PeftConfig): "initialization from the reference implementation from Microsoft. Passing 'gaussian' results " "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " "to False leads to completely random initialization and is discouraged." + "Pass `'loftq'` to use LoftQ initialization" ), }, ) @@ -121,6 +141,16 @@ class LoraConfig(PeftConfig): ) }, ) + # dict type is used when loading config.json + loftq_config: Union[LoftQConfig, dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone " + "weights and initialize Lora layers." + ) + }, + ) def __post_init__(self): self.peft_type = PeftType.LORA @@ -134,3 +164,16 @@ def __post_init__(self): # if target_modules is a regex expression, then layers_pattern should be None if isinstance(self.target_modules, str) and self.layers_pattern is not None: raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + + # handle init_lora_weights and loftq_config + if self.init_lora_weights == "loftq": + import importlib + + if not importlib.util.find_spec("scipy"): + raise ImportError("The required package 'scipy' is not installed. Please install it to continue.") + if self.loftq_config is None: + raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") + + # convert loftq_config to dict + if self.loftq_config is not None and not isinstance(self.loftq_config, dict): + self.loftq_config = vars(self.loftq_config) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 5ea726d2ff..cf97108c87 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -15,7 +15,7 @@ import math import warnings -from typing import Any, List, Optional +from typing import Any, List, Optional, Union import torch import torch.nn as nn @@ -46,6 +46,7 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None: # Mark the weight as unmerged self._disable_adapters = False self.merged_adapters = [] + self.kwargs = kwargs base_layer = self.get_base_layer() if isinstance(base_layer, nn.Linear): @@ -83,7 +84,10 @@ def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weig self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False) self.scaling[adapter_name] = lora_alpha / r - if init_lora_weights: + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(self.get_base_layer(), "weight", None) @@ -115,7 +119,10 @@ def update_layer_conv2d(self, adapter_name, r, lora_alpha, lora_dropout, init_lo self.lora_A[adapter_name] = nn.Conv2d(self.in_features, r, kernel_size, stride, padding, bias=False) self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) self.scaling[adapter_name] = lora_alpha / r - if init_lora_weights: + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: self.reset_lora_parameters(adapter_name, init_lora_weights) weight = getattr(base_layer, "weight", None) @@ -142,7 +149,11 @@ def update_layer_embedding(self, adapter_name, r, 
lora_alpha, lora_dropout, init self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) self.scaling[adapter_name] = lora_alpha / r - self.reset_lora_parameters(adapter_name, init_lora_weights) + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) base_layer = self.get_base_layer() weight = getattr(base_layer, "weight", None) @@ -170,6 +181,27 @@ def reset_lora_parameters(self, adapter_name, init_lora_weights): nn.init.zeros_(self.lora_embedding_A[adapter_name]) nn.init.normal_(self.lora_embedding_B[adapter_name]) + def loftq_init(self, adapter_name): + from peft.utils.loftq_utils import loftq_init + + weight = self.get_base_layer().weight + kwargs = { + "num_bits": self.kwargs.get("loftq_bits", 4), + "reduced_rank": self.r[adapter_name], + "num_iter": self.kwargs.get("loftq_iter", 1), + } + + qweight, lora_A, lora_B = loftq_init(weight, **kwargs) + if adapter_name in self.lora_A.keys(): + # initialize A the same way as the default for nn.Linear and B to zero + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + self.lora_embedding_A[adapter_name].weight.data = lora_A + self.lora_embedding_B[adapter_name].weight.data = lora_B + self.get_base_layer().weight.data = qweight + def set_scale(self, adapter, scale): if adapter not in self.scaling: # Ignore the case where the adapter is not in the layer @@ -218,11 +250,11 @@ def __init__( lora_dropout: float = 0.0, fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) is_target_conv_1d_layer: bool = False, - init_lora_weights: bool = True, + init_lora_weights: Union[bool, str] = True, **kwargs, ) -> None: super().__init__() - LoraLayer.__init__(self, base_layer) + LoraLayer.__init__(self, base_layer, **kwargs) self.fan_in_fan_out = fan_in_fan_out self._active_adapter = adapter_name @@ -351,7 +383,7 @@ def __init__( r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, - init_lora_weights: bool = True, + init_lora_weights: Union[bool, str] = True, **kwargs, ) -> None: super().__init__() @@ -491,7 +523,7 @@ def __init__( r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, - init_lora_weights: bool = True, + init_lora_weights: Union[bool, str] = True, **kwargs, ) -> None: super().__init__() diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 653a684276..6e0a64187a 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -286,8 +286,10 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): elif isinstance(target_base_layer, torch.nn.Embedding): embedding_kwargs = kwargs.copy() embedding_kwargs.pop("fan_in_fan_out", None) + embedding_kwargs.update(lora_config.loftq_config) new_module = Embedding(target, adapter_name, **embedding_kwargs) elif isinstance(target_base_layer, torch.nn.Conv2d): + kwargs.update(lora_config.loftq_config) new_module = Conv2d(target, adapter_name, **kwargs) elif isinstance(target_base_layer, torch.nn.Linear): if kwargs["fan_in_fan_out"]: @@ -296,6 +298,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): "Setting fan_in_fan_out to False." 
) kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + kwargs.update(lora_config.loftq_config) new_module = Linear(target, adapter_name, **kwargs) elif isinstance(target_base_layer, Conv1D): if not kwargs["fan_in_fan_out"]: @@ -304,6 +307,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): "Setting fan_in_fan_out to True." ) kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + kwargs.update(lora_config.loftq_config) new_module = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) else: raise ValueError( diff --git a/src/peft/utils/loftq_utils.py b/src/peft/utils/loftq_utils.py new file mode 100644 index 0000000000..81ff1e2c34 --- /dev/null +++ b/src/peft/utils/loftq_utils.py @@ -0,0 +1,227 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference code: https://github.com/yxli2123/LoftQ/blob/main/utils.py +# Reference paper: https://arxiv.org/abs/2310.08659 + +import logging +from typing import Union + +import torch + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available + + +if is_bnb_available(): + import bitsandbytes as bnb + + +class NFQuantizer: + def __init__(self, num_bits=2, device="cuda", method="normal", block_size=64, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_bits = num_bits + self.device = device + self.method = method + self.block_size = block_size + if self.method == "normal": + self.norm_lookup_table = self.create_normal_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + elif self.method == "uniform": + self.norm_lookup_table = self.create_uniform_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + else: + raise NotImplementedError("Other quantization methods not supported yet.") + + @staticmethod + def create_uniform_map(symmetric=False, num_bits=4): + if symmetric: + # print("symmetric uniform quantization") + negative = torch.linspace(-1, 0, 2 ** (num_bits - 1)) + positive = torch.linspace(0, 1, 2 ** (num_bits - 1)) + table = torch.cat([negative, positive[1:]]) + else: + # print("asymmetric uniform quantization") + table = torch.linspace(-1, 1, 2**num_bits) + return table + + @staticmethod + def create_normal_map(offset=0.9677083, symmetric=False, num_bits=2): + try: + from scipy.stats import norm + except ImportError: + raise ImportError("The required package 'scipy' is not installed. 
Please install it to continue.") + + variations = 2**num_bits + if symmetric: + v = norm.ppf(torch.linspace(1 - offset, offset, variations + 1)).tolist() + values = [] + for index in range(len(v) - 1): + values.append(0.5 * v[index] + 0.5 * v[index + 1]) + v = values + else: + # one more positive value, this is an asymmetric type + v1 = norm.ppf(torch.linspace(offset, 0.5, variations // 2 + 1)[:-1]).tolist() + v2 = [0] + v3 = (-norm.ppf(torch.linspace(offset, 0.5, variations // 2)[:-1])).tolist() + v = v1 + v2 + v3 + + values = torch.Tensor(v) + values = values.sort().values + values /= values.max() + return values + + def quantize_tensor(self, weight): + max_abs = torch.abs(weight).max() + weight_normed = weight / max_abs + + weight_normed_expanded = weight_normed.unsqueeze(-1) + + # Reshape L to have the same number of dimensions as X_expanded + L_reshaped = torch.tensor(self.norm_lookup_table).reshape(1, -1) + + # Calculate the absolute difference between X_expanded and L_reshaped + abs_diff = torch.abs(weight_normed_expanded - L_reshaped) + + # Find the index of the minimum absolute difference for each element + qweight = torch.argmin(abs_diff, dim=-1) + return qweight, max_abs + + def dequantize_tensor(self, qweight, max_abs): + qweight_flatten = qweight.flatten() + + weight_normed = self.norm_lookup_table[qweight_flatten] + weight = weight_normed * max_abs + + weight = weight.reshape(qweight.shape) + + return weight + + def quantize_block(self, weight): + if len(weight.shape) != 2: + raise ValueError(f"Only support 2D matrix, but your input has {len(weight.shape)} dimensions.") + if weight.shape[0] * weight.shape[1] % self.block_size != 0: + raise ValueError( + f"Weight with shape ({weight.shape[0]} x {weight.shape[1]}) " + f"is not dividable by block size {self.block_size}." 
+ ) + + M, N = weight.shape + device = weight.device + + # Quantization + weight_flatten = weight.flatten() # (M*N, ) + weight_block = weight_flatten.reshape(-1, self.block_size) # (L, B), L = M * N / B + if self.method == "normal": + weight_max = weight_block.abs().max(dim=-1)[0] # (L, 1) + elif self.method == "uniform": + weight_max = weight_block.mean(dim=-1) + 2.5 * weight_block.std(dim=-1) + else: + raise NotImplementedError("Method not supported yet.") + weight_max = weight_max.unsqueeze(-1) + weight_divabs = weight_block / weight_max # (L, B) + weight_divabs = weight_divabs.unsqueeze(-1) # (L, B, 1) + L_reshaped = self.norm_lookup_table.reshape(1, -1) # (1, 2**K) + + abs_diff = torch.abs(weight_divabs - L_reshaped) # (L, B, 2**K) + qweight = torch.argmin(abs_diff, dim=-1) # (L, B) + + # Pack multiple k-bit into uint8 + qweight = qweight.reshape(-1, 8 // self.num_bits) + qweight_pack = torch.zeros((M * N // 8 * self.num_bits, 1), dtype=torch.uint8, device=device) + + # data format example: + # [1, 0, 3, 2] or [01, 00, 11, 10] -> [10110001], LIFO + for i in range(8 // self.num_bits): + qweight[:, i] = qweight[:, i] << i * self.num_bits + qweight_pack[:, 0] |= qweight[:, i] + + return qweight_pack, weight_max, weight.shape + + def dequantize_block(self, qweight, weight_max, weight_shape): + # unpack weight + device = qweight.device + weight = torch.zeros((qweight.shape[0], 8 // self.num_bits), dtype=torch.float32, device=device) + for i in range(8 // self.num_bits): + lookup_table_idx = qweight.to(torch.long) % 2**self.num_bits # get the most right 2 bits + lookup_table_idx = lookup_table_idx.to(torch.int) + weight[:, i] = self.norm_lookup_table[lookup_table_idx].squeeze() + qweight = qweight >> self.num_bits # right shift 2 bits of the original data + + weight_block = weight.reshape(-1, self.block_size) + weight = weight_block * weight_max + weight = weight.reshape(weight_shape) + + return weight + + +def _low_rank_decomposition(weight, reduced_rank=32): + """ + :param weight: The matrix to decompose, of shape (H, W) :param reduced_rank: the final rank :return: + """ + matrix_dimension = len(weight.size()) + if matrix_dimension != 2: + raise ValueError(f"Only support 2D matrix, but your input has {matrix_dimension} dimensions.") + + # Use SVD to decompose a matrix, default full_matrices is False to save parameters + U, S, Vh = torch.linalg.svd(weight, full_matrices=False) + + L = U @ (torch.sqrt(torch.diag(S)[:, 0:reduced_rank])) + R = torch.sqrt(torch.diag(S)[0:reduced_rank, :]) @ Vh + + return {"L": L, "R": R, "U": U, "S": S, "Vh": Vh, "reduced_rank": reduced_rank} + + +@torch.no_grad() +def loftq_init(weight: Union[torch.Tensor, torch.nn.Parameter], num_bits: int, reduced_rank: int, num_iter=1): + if num_bits not in [2, 4, 8]: + raise ValueError("Only support 2, 4, 8 bits quantization") + if num_iter <= 0: + raise ValueError("Number of iterations must be greater than 0") + + out_feature, in_feature = weight.size() + device = weight.device + dtype = weight.dtype + + logging.info( + f"Weight: ({out_feature}, {in_feature}) | Rank: {reduced_rank} " + f"| Num Iter: {num_iter} | Num Bits: {num_bits}" + ) + if not is_bnb_4bit_available(): + quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64) + + weight = weight.to(torch.float32) + res = weight.clone() + for i in range(num_iter): + torch.cuda.empty_cache() + # Quantization + if num_bits == 4 and is_bnb_4bit_available(): + qweight = bnb.nn.Params4bit( + res.to("cpu"), requires_grad=False, 
compress_statistics=False, quant_type="nf4" + ).to(device) + dequantized_weight = bnb.functional.dequantize_4bit(qweight.data, qweight.quant_state) + else: + quantized_weight, max_abs, shape = quantizer.quantize_block(res) + dequantized_weight = quantizer.dequantize_block(quantized_weight, max_abs, shape) + + res = weight - dequantized_weight + + # Decompose the residual by SVD + output = _low_rank_decomposition(res, reduced_rank=reduced_rank) + L, R, reduced_rank = output["L"], output["R"], output["reduced_rank"] + res = weight - torch.mm(L, R) + + lora_A, lora_B = R, L + + return dequantized_weight.to(dtype), lora_A, lora_B From 2674f5ea66b43e07f08dabe2634aa9542d979211 Mon Sep 17 00:00:00 2001 From: zhangshengdong29 <435878393@qq.com> Date: Thu, 30 Nov 2023 23:24:58 +0800 Subject: [PATCH 5/6] Megatron distributed parallel linear LoRA (#1092) Adds option to use Megatron's ColumnParallelLinear and RowParallelLinear for LoRA linear layers, leading to improved performance when using LoRA with Megatron. --- src/peft/tuners/lora/config.py | 26 +++++ src/peft/tuners/lora/layer.py | 3 + src/peft/tuners/lora/model.py | 27 +++++ src/peft/tuners/lora/tp_layer.py | 158 +++++++++++++++++++++++++++++ tests/test_lora_megatron.py | 167 +++++++++++++++++++++++++++++++ 5 files changed, 381 insertions(+) create mode 100644 src/peft/tuners/lora/tp_layer.py create mode 100644 tests/test_lora_megatron.py diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 0dcca5c1e6..53269ebb8d 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -141,6 +141,32 @@ class LoraConfig(PeftConfig): ) }, ) + megatron_config: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The TransformerConfig from Megatron, it is used to create LoRA's parallel linear layer." + "You can get it like this, `core_transformer_config_from_args(get_args())`, " + "this two functions are from Megatron." + "You need to specify this parameter when you want to loraize the ColumnParallelLinear and " + "RowParallelLinear layers of megatron." + "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " + "functions, because TransformerConfig may not necessarily be serialized." + "But when using megatron, we can use `get_peft_model_state_dict` function and " + "megatron's framework, they can also save and load models and configurations." + ) + }, + ) + megatron_core: Optional[str] = field( + default="megatron.core", + metadata={ + "help": ( + "The core module from Megatron, it is used to judge and create LoRA's parallel linear layer. " + "It only needs to be passed in when you need to use your own modified megatron core module. " + "Otherwise, it will use the default value `megatron.core`. 
" + ) + }, + ) # dict type is used when loading config.json loftq_config: Union[LoftQConfig, dict] = field( default_factory=dict, diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index cf97108c87..3219ca1e47 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -62,6 +62,9 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None: elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): # QuantLinear in_features, out_features = base_layer.infeatures, base_layer.outfeatures + elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): + # Megatron ColumnParallelLinear,RowParallelLinear + in_features, out_features = base_layer.input_size, base_layer.output_size else: raise ValueError(f"Unsupported layer type {type(base_layer)}") diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 6e0a64187a..4f6538e912 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib import math import operator import re @@ -259,6 +260,10 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): else: target_base_layer = target + megatron_core = None + if lora_config.megatron_config: + megatron_core = importlib.import_module(lora_config.megatron_core) + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): eightbit_kwargs = kwargs.copy() eightbit_kwargs.update( @@ -300,6 +305,28 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs): kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False kwargs.update(lora_config.loftq_config) new_module = Linear(target, adapter_name, **kwargs) + elif megatron_core and isinstance( + target_base_layer, + (megatron_core.tensor_parallel.ColumnParallelLinear, megatron_core.tensor_parallel.RowParallelLinear), + ): + from .tp_layer import LoraParallelLinear + + megatron_kwargs = kwargs.copy() + megatron_config = lora_config.megatron_config + if isinstance(megatron_config, dict): + transformer_config_class = megatron_core.transformer.transformer_config.TransformerConfig + megatron_config = transformer_config_class(**lora_config.megatron_config) + megatron_kwargs["megatron_config"] = megatron_config + if megatron_kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `ColumnParallelLinear` " + "or `RowParallelLinear`. " + "Setting fan_in_fan_out to False." 
+ ) + megatron_kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + new_module = LoraParallelLinear( + base_layer=target, adapter_name=adapter_name, backend=megatron_core.tensor_parallel, **megatron_kwargs + ) elif isinstance(target_base_layer, Conv1D): if not kwargs["fan_in_fan_out"]: warnings.warn( diff --git a/src/peft/tuners/lora/tp_layer.py b/src/peft/tuners/lora/tp_layer.py new file mode 100644 index 0000000000..676430cf38 --- /dev/null +++ b/src/peft/tuners/lora/tp_layer.py @@ -0,0 +1,158 @@ +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.init as init + +from .layer import LoraLayer + + +class LoraParallelLinear(nn.Module, LoraLayer): + """ + When the target layer parallel_linear is RowParallelLinear, in order to keep the input and output shapes + consistent, we need to split the lora matrix A into rows, and the lora_B at this time should be a complete linear + layer; In the same way, when the target layer is ColumnParallelLinear, we perform column segmentation on lora_B, + while lora_A is still a complete linear layer. + """ + + def __init__( + self, + base_layer, + adapter_name: str, + backend, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_lora_weights: bool = True, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer=base_layer) + + self.backend = backend + self.is_paralle_a = isinstance(base_layer, backend.RowParallelLinear) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + + megatron_config = kwargs["megatron_config"] + parallel_linear_kwargs = {"megatron_config": megatron_config} + init_method = init.xavier_normal_ + if hasattr(megatron_config, "init_method"): + init_method = megatron_config.init_method + input_is_parallel = True + gather_output = False + if isinstance(base_layer, self.backend.RowParallelLinear): + input_is_parallel = base_layer.input_is_parallel + else: + gather_output = base_layer.gather_output + self.update_layer( + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + init_method, + input_is_parallel, + gather_output, + **parallel_linear_kwargs, + ) + + self.is_target_conv_1d_layer = False + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + init_method=init.xavier_normal_, + input_is_parallel=True, + gather_output=False, + **parallel_linear_kwargs, + ): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + + megatron_config = parallel_linear_kwargs["megatron_config"] + # lora needs to be forced to upgrade to 32-bit precision, otherwise it will overflow + megatron_config.params_dtype = torch.float32 + if self.is_paralle_a: + lora_a = self.backend.RowParallelLinear( + input_size=self.in_features, + output_size=r, + bias=False, + input_is_parallel=input_is_parallel, + skip_bias_add=True, + init_method=init_method, + config=megatron_config, + ) + lora_b = nn.Linear(in_features=r, out_features=self.out_features, bias=False, dtype=torch.float32) + else: + lora_a = nn.Linear(in_features=self.in_features, out_features=r, bias=False, dtype=torch.float32) + lora_b = self.backend.ColumnParallelLinear( + input_size=r, + output_size=self.out_features, + 
bias=False, + gather_output=gather_output, + init_method=init_method, + config=megatron_config, + ) + self.lora_A[adapter_name] = lora_a + self.lora_B[adapter_name] = lora_b + self.scaling[adapter_name] = lora_alpha / r + if init_lora_weights: + self.reset_lora_parameters(adapter_name) + + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any): + previous_dtype = x.dtype + # If weight is used for matrix multiplication here, the final aggregation operation of the original + # parallel_linear layer will be missing, so we need to directly call its forward function to obtain the + # output of the original parallel_linear layer. + if self.disable_adapters: + if self.merged: + self.unmerge() + result, bias = self.base_layer(x, *args, **kwargs) + elif self.merged: + result, bias = self.base_layer(x, *args, **kwargs) + else: + result, bias = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + + lora_result = lora_A(dropout(x)) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_B(lora_result) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_result * scaling + + result = result + lora_result + + result = result.to(previous_dtype) + return result, bias diff --git a/tests/test_lora_megatron.py b/tests/test_lora_megatron.py new file mode 100644 index 0000000000..80d0f43010 --- /dev/null +++ b/tests/test_lora_megatron.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
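+
+# These tests exercise LoraParallelLinear on top of Megatron's ColumnParallelLinear and
+# RowParallelLinear. They need megatron-core, a CUDA device and an NCCL process group, so the
+# tests below are only defined when `megatron` can be imported.
+#
+# Minimal usage sketch (illustrative, mirroring setUp below; assumes `config` is a Megatron
+# TransformerConfig and `module` contains tensor-parallel linear layers):
+#
+#     lora_config = LoraConfig(r=8, target_modules=["linear"], megatron_config=config,
+#                              megatron_core="megatron.core")
+#     peft_model = get_peft_model(module, lora_config)
+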
+import copy +import importlib +import os +import unittest + +import torch +import torch.nn.init as init + +from peft import LoraConfig, PeftModel, get_peft_model, get_peft_model_state_dict + + +def is_megatron_available() -> bool: + return importlib.util.find_spec("megatron") is not None + + +if is_megatron_available(): + from megatron.core import parallel_state, tensor_parallel + from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + from megatron.core.transformer.module import MegatronModule + from megatron.core.transformer.transformer_config import TransformerConfig + + world_size = 1 + rank = 0 + + def initialize_distributed(): + print(f"Initializing torch.distributed with rank: {rank}, world_size: {world_size}") + torch.cuda.set_device(0) + init_method = "tcp://" + master_ip = os.getenv("MASTER_ADDR", "localhost") + master_port = os.getenv("MASTER_PORT", "6001") + init_method += master_ip + ":" + master_port + torch.distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank, init_method=init_method) + + def destroy_model_parallel(): + parallel_state.destroy_model_parallel() + torch.distributed.barrier() + + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + ): + parallel_state.destroy_model_parallel() + if not torch.distributed.is_initialized(): + initialize_distributed() + parallel_state.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + ) + + class DummyModule(MegatronModule): + def __init__(self, config: TransformerConfig): + super().__init__(config) + self.linear = tensor_parallel.ColumnParallelLinear( + input_size=10, + output_size=10, + config=config, + init_method=init.xavier_normal_, + bias=False, + gather_output=False, + ) + self.lm_head = tensor_parallel.RowParallelLinear( + input_size=10, + output_size=10, + config=config, + init_method=init.xavier_normal_, + bias=False, + input_is_parallel=True, + ) + + def forward(self, input): + x = self.linear(input)[0] + x = self.lm_head(x)[0] + return x + + class TestMegatronLora(unittest.TestCase): + def setUp(self): + initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = { + "num_layers": 2, + "hidden_size": 12, + "num_attention_heads": 4, + "use_cpu_initialization": True, + } + config = TransformerConfig(**transformer_config) + self.megatron_module = DummyModule(config=config).cuda() + self.dummy_module = copy.deepcopy(self.megatron_module).cuda() + + lora_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + target_modules=["linear", "lm_head"], + megatron_config=config, + megatron_core="megatron.core", + ) + self.megatron_module = get_peft_model(self.megatron_module, lora_config) + + def tearDown(self): + destroy_model_parallel() + + def test_megatron_lora_module(self): + megatron_module = self.megatron_module + self.assertTrue(isinstance(megatron_module, PeftModel)) + + for name, module in megatron_module.named_modules(): + if name.endswith("linear"): + self.assertTrue(hasattr(module, "lora_A")) + self.assertTrue(hasattr(module, "lora_B")) + if name.endswith("linear.lora_A.default"): + self.assertTrue(isinstance(module, torch.nn.Linear)) + if name.endswith("linear.lora_B.default"): + self.assertTrue(isinstance(module, tensor_parallel.ColumnParallelLinear)) + + if 
name.endswith("lm_head.lora_A.default"): + self.assertTrue(isinstance(module, tensor_parallel.RowParallelLinear)) + if name.endswith("lm_head.lora_B.default"): + self.assertTrue(isinstance(module, torch.nn.Linear)) + + def test_forward(self): + x = torch.ones((2, 4, 10)).cuda() + megatron_module_result = self.megatron_module(x) + dummt_module_result = self.dummy_module(x) + + # Because lora_B is initialized with 0, the forward results of two models should be equal before backward. + self.assertTrue(megatron_module_result.equal(dummt_module_result)) + + def test_backward(self): + optimizer = torch.optim.AdamW(self.megatron_module.parameters()) + loss_fn = torch.nn.CrossEntropyLoss() + + x = torch.randn(2, 4, 10, requires_grad=True).cuda() + label = torch.randint(10, (2 * 4,)).cuda() + + output = self.megatron_module(x) + output = output.reshape(2 * 4, 10) + loss = loss_fn(output, label) + + loss.backward() + optimizer.step() + + def test_get_peft_model_state_dict(self): + peft_state_dict = get_peft_model_state_dict(self.megatron_module) + + for key in peft_state_dict.keys(): + self.assertTrue("lora" in key) From da17ac0f484b28a8471004b47bddfc408969ae04 Mon Sep 17 00:00:00 2001 From: takuoko Date: Fri, 1 Dec 2023 00:58:42 +0900 Subject: [PATCH 6/6] [Feature] Support OFT (#1160) * Support OFT * add test * Update README * fix code quality * fix test * Skip 1 test * fix eps rule and add more test * feat: added examples to new OFT method * fix: removed wrong arguments from model example * fix: changed name of inference file * fix: changed prompt variable * fix docs * fix: dreambooth inference revision based on feedback * fix: review from BenjaminBossan * apply safe merge * del partially * refactor oft * refactor oft * del unused line * del unused line * fix skip in windows * skip test * Add comments about bias added place * rename orig_weights to new_weights * use inverse instead of linalg.inv * delete alpha and scaling --------- Co-authored-by: Lukas Kuhn Co-authored-by: Lukas Kuhn --- README.md | 7 +- .../oft_dreambooth_inference.ipynb | 89 ++ examples/oft_dreambooth/train_dreambooth.py | 1112 +++++++++++++++++ src/peft/__init__.py | 2 + src/peft/mapping.py | 4 + src/peft/peft_model.py | 2 + src/peft/tuners/__init__.py | 1 + src/peft/tuners/oft/__init__.py | 21 + src/peft/tuners/oft/config.py | 109 ++ src/peft/tuners/oft/layer.py | 375 ++++++ src/peft/tuners/oft/model.py | 108 ++ src/peft/utils/peft_types.py | 1 + src/peft/utils/save_and_load.py | 5 +- tests/test_config.py | 4 +- tests/test_custom_models.py | 103 +- tests/test_stablediffusion.py | 23 +- tests/testing_common.py | 6 +- 17 files changed, 1959 insertions(+), 13 deletions(-) create mode 100644 examples/oft_dreambooth/oft_dreambooth_inference.ipynb create mode 100644 examples/oft_dreambooth/train_dreambooth.py create mode 100644 src/peft/tuners/oft/__init__.py create mode 100644 src/peft/tuners/oft/config.py create mode 100644 src/peft/tuners/oft/layer.py create mode 100644 src/peft/tuners/oft/model.py diff --git a/README.md b/README.md index 79259f98ee..09846dc61c 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Supported methods: 8. LoHa: [FedPara: Low-Rank Hadamard Product for Communication-Efficient Federated Learning](https://arxiv.org/abs/2108.06098) 9. LoKr: [KronA: Parameter Efficient Tuning with Kronecker Adapter](https://arxiv.org/abs/2212.10650) based on [Navigating Text-To-Image Customization:From LyCORIS Fine-Tuning to Model Evaluation](https://arxiv.org/abs/2309.14859) implementation 10. 
LoftQ: [LoftQ: LoRA-Fine-Tuning-aware Quantization for Large Language Models](https://arxiv.org/abs/2310.08659) +11. OFT: [Controlling Text-to-Image Diffusion by Orthogonal Finetuning](https://arxiv.org/abs/2306.07280) ## Getting started @@ -278,9 +279,9 @@ Find models that are supported out of the box below. Note that PEFT works with a ### Text-to-Image Generation -| Model | LoRA | LoHa | LoKr | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | -| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | -| Stable Diffusion | ✅ | ✅ | ✅ | | | | +| Model | LoRA | LoHa | LoKr | OFT | Prefix Tuning | P-Tuning | Prompt Tuning | IA3 | +| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| Stable Diffusion | ✅ | ✅ | ✅ | ✅ | | | | ### Image Classification diff --git a/examples/oft_dreambooth/oft_dreambooth_inference.ipynb b/examples/oft_dreambooth/oft_dreambooth_inference.ipynb new file mode 100644 index 0000000000..4a28c4040e --- /dev/null +++ b/examples/oft_dreambooth/oft_dreambooth_inference.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "acd7b15e", + "metadata": {}, + "source": [ + "# Dreambooth with OFT\n", + "This Notebook assumes that you already ran the train_dreambooth.py script to create your own adapter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acab479f", + "metadata": {}, + "outputs": [], + "source": [ + "from diffusers import DiffusionPipeline\n", + "from diffusers.utils import check_min_version, get_logger\n", + "from peft import PeftModel\n", + "\n", + "# Will error if the minimal version of diffusers is not installed. Remove at your own risks.\n", + "check_min_version(\"0.10.0.dev0\")\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "BASE_MODEL_NAME = \"stabilityai/stable-diffusion-2-1-base\"\n", + "ADAPTER_MODEL_PATH = \"INSERT MODEL PATH HERE\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipe = DiffusionPipeline.from_pretrained(\n", + " BASE_MODEL_NAME,\n", + ")\n", + "pipe.to('cuda')\n", + "pipe.unet = PeftModel.from_pretrained(pipe.unet, ADAPTER_MODEL_PATH + \"/unet\", adapter_name=\"default\")\n", + "pipe.text_encoder = PeftModel.from_pretrained(pipe.text_encoder, ADAPTER_MODEL_PATH + \"/text_encoder\", adapter_name=\"default\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"A photo of a sks dog\"\n", + "image = pipe(\n", + " prompt,\n", + " num_inference_steps=50,\n", + " height=512,\n", + " width=512,\n", + ").images[0]\n", + "image" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/oft_dreambooth/train_dreambooth.py b/examples/oft_dreambooth/train_dreambooth.py new file mode 100644 index 0000000000..cacce70647 --- /dev/null +++ b/examples/oft_dreambooth/train_dreambooth.py @@ -0,0 +1,1112 @@ +import argparse +import gc +import hashlib +import itertools +import logging +import math +import os +import threading 
+import warnings +from contextlib import nullcontext +from pathlib import Path +from typing import Optional + +import datasets +import diffusers +import numpy as np +import psutil +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import HfFolder, Repository, whoami +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +from peft import get_peft_model +from peft.tuners.oft.config import OFTConfig + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.10.0.dev0") + +logger = get_logger(__name__) + +UNET_TARGET_MODULES = ["to_q", "to_v", "query", "value"] # , "ff.net.0.proj"] +TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj"] + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + else: + raise ValueError(f"{model_class} is not supported.") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + required=True, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + 
parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + + # oft args + parser.add_argument("--use_oft", action="store_true", help="Whether to use OFT for parameter efficient tuning") + parser.add_argument("--oft_r", type=int, default=8, help="OFT rank, only used if use_oft is True") + parser.add_argument("--oft_alpha", type=int, default=32, help="OFT alpha, only used if use_oft is True") + parser.add_argument("--oft_dropout", type=float, default=0.0, help="OFT dropout, only used if use_oft is True") + parser.add_argument( + "--oft_use_coft", action="store_true", help="Using constrained OFT, only used if use_oft is True" + ) + parser.add_argument( + "--oft_eps", + type=float, + default=0.0, + help="The control strength of COFT. Only has an effect if `oft_use_coft` is set to True.", + ) + + parser.add_argument( + "--oft_text_encoder_r", + type=int, + default=8, + help="OFT rank for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_alpha", + type=int, + default=32, + help="OFT alpha for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_dropout", + type=float, + default=0.0, + help="OFT dropout for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_use_coft", + action="store_true", + help="Using constrained OFT on the text encoder, only used if use_oft is True", + ) + parser.add_argument( + "--oft_text_encoder_eps", + type=float, + default=0.0, + help="The control strength of COFT on the text encoder. Only has an effect if `oft_text_encoder_use_coft` is set to True.", + ) + + parser.add_argument( + "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader." + ) + + parser.add_argument( + "--no_tracemalloc", + default=False, + action="store_true", + help="Flag to stop memory allocation tracing during training. 
This could speed up training on Windows.", + ) + + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--wandb_key", + type=str, + default=None, + help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), + ) + parser.add_argument( + "--wandb_project_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + return args + + +# Converting Bytes to Megabytes +def b2mb(x): + return int(x / 2**20) + + +# This context manager is used to track the peak memory usage of the process +class TorchTracemalloc: + def __enter__(self): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero + self.begin = torch.cuda.memory_allocated() + self.process = psutil.Process() + + self.cpu_begin = self.cpu_mem_used() + self.peak_monitoring = True + peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) + peak_monitor_thread.daemon = True + peak_monitor_thread.start() + return self + + def cpu_mem_used(self): + """get resident set size memory for the current process""" + return self.process.memory_info().rss + + def peak_monitor_func(self): + self.cpu_peak = -1 + + while True: + self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak) + + # can't sleep or will not catch the peak right (this comment is here on purpose) + # time.sleep(0.001) # 1msec + + if not self.peak_monitoring: + break + + def __exit__(self, *exc): + self.peak_monitoring = False + + gc.collect() + torch.cuda.empty_cache() + self.end = torch.cuda.memory_allocated() + self.peak = torch.cuda.max_memory_allocated() + self.used = b2mb(self.end - self.begin) + self.peaked = b2mb(self.peak - self.begin) + + self.cpu_end = self.cpu_mem_used() + self.cpu_used = b2mb(self.cpu_end - self.cpu_begin) + self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin) + # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}") + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. 
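+    Each item provides `instance_images` and `instance_prompt_ids`; when `class_data_root` is
+    given it also provides `class_images` and `class_prompt_ids` for the prior-preservation loss.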
+ """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.class_data_root: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_prompt_ids"] = self.tokenizer( + self.class_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + return example + + +def collate_fn(examples, with_prior_preservation=False): + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = torch.cat(input_ids, dim=0) + + batch = { + "input_ids": input_ids, + "pixel_values": pixel_values, + } + return batch + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
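+    # Each item is a dict {"prompt": ..., "index": ...}; the dataset is only used to batch the
+    # class prompt when sampling missing class images for prior preservation.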
+ + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): + if token is None: + token = HfFolder.get_token() + if organization is None: + username = whoami(token)["name"] + return f"{username}/{model_id}" + else: + return f"{organization}/{model_id}" + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_dir=logging_dir, + ) + if args.report_to == "wandb": + import wandb + + wandb.login(key=args.wandb_key) + wandb.init(project=args.wandb_project_name) + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Generate class images if prior preservation is enabled. 
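+    # If class_data_dir contains fewer than args.num_class_images images, the frozen base
+    # pipeline samples the missing ones from args.class_prompt; they act as regularization
+    # targets for the prior-preservation loss, and the pipeline is deleted again afterwards.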
+ if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) # noqa: F841 + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDPMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) # DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + if args.use_oft: + config = OFTConfig( + r=args.oft_r, + alpha=args.oft_alpha, + 
target_modules=UNET_TARGET_MODULES, + module_dropout=args.oft_dropout, + init_weights=True, + coft=args.oft_use_coft, + eps=args.oft_eps, + ) + unet = get_peft_model(unet, config) + unet.print_trainable_parameters() + print(unet) + + vae.requires_grad_(False) + if not args.train_text_encoder: + text_encoder.requires_grad_(False) + elif args.train_text_encoder and args.use_oft: + config = OFTConfig( + r=args.oft_text_encoder_r, + alpha=args.oft_text_encoder_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + module_dropout=args.oft_text_encoder_dropout, + init_weights=True, + coft=args.oft_text_encoder_use_coft, + eps=args.oft_text_encoder_eps, + ) + text_encoder = get_peft_model(text_encoder, config) + text_encoder.print_trainable_parameters() + print(text_encoder) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + # below fails when using oft so commenting it out + if args.train_text_encoder and not args.use_oft: + text_encoder.gradient_checkpointing_enable() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + num_workers=args.num_dataloader_workers, + ) + + # Scheduler and math around the number of training steps. 
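+    # If --max_train_steps is not given, it is derived from --num_train_epochs here; it is
+    # recomputed after accelerator.prepare() below because the dataloader length can change
+    # once it is sharded across processes.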
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae and text_encoder to device and cast to weight_dtype + vae.to(accelerator.device, dtype=weight_dtype) + if not args.train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("dreambooth", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the mos recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = resume_global_step // num_update_steps_per_epoch + resume_step = resume_global_step % num_update_steps_per_epoch + + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + if args.train_text_encoder: + text_encoder.train() + with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc: + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + continue + + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. 
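With prior preservation, the collated batch concatenates instance examples and class examples, so the prediction and target can be split back along the batch dimension before weighting the two losses. A minimal, self-contained sketch of the pattern the next hunk implements, with random tensors standing in for the real predictions:

```python
import torch
import torch.nn.functional as F

# Hypothetical stand-ins: the first half of the batch are instance examples,
# the second half are class (prior) examples, as produced by the collate_fn.
model_pred = torch.randn(4, 4, 64, 64)
target = torch.randn(4, 4, 64, 64)
prior_loss_weight = 1.0

model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
target, target_prior = torch.chunk(target, 2, dim=0)

instance_loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
loss = instance_loss + prior_loss_weight * prior_loss
```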
+ model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + global_step += 1 + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if ( + args.validation_prompt is not None + and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0 + ): + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." + ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + revision=args.revision, + ) + # set `keep_fp32_wrapper` to True because we do not want to remove + # mixed precision hooks while we are still training + pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True) + pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + if args.seed is not None: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + else: + generator = None + images = [] + for _ in range(args.num_validation_images): + image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + import wandb + + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + if global_step >= args.max_train_steps: + break + # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage + + if not args.no_tracemalloc: + accelerator.print("GPU Memory before entering the train : {}".format(b2mb(tracemalloc.begin))) + accelerator.print("GPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used)) + accelerator.print("GPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked)) + accelerator.print( + "GPU Total Peak Memory consumed during the train (max): {}".format( + tracemalloc.peaked + 
b2mb(tracemalloc.begin) + ) + ) + + accelerator.print("CPU Memory before entering the train : {}".format(b2mb(tracemalloc.cpu_begin))) + accelerator.print( + "CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used) + ) + accelerator.print( + "CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked) + ) + accelerator.print( + "CPU Total Peak Memory consumed during the train (max): {}".format( + tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin) + ) + ) + + # Create the pipeline using using the trained modules and save it. + accelerator.wait_for_everyone() + if accelerator.is_main_process: + if args.use_oft: + unwarpped_unet = accelerator.unwrap_model(unet) + unwarpped_unet.save_pretrained( + os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet) + ) + if args.train_text_encoder: + unwarpped_text_encoder = accelerator.unwrap_model(text_encoder) + unwarpped_text_encoder.save_pretrained( + os.path.join(args.output_dir, "text_encoder"), + state_dict=accelerator.get_state_dict(text_encoder), + ) + else: + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + revision=args.revision, + ) + pipeline.save_pretrained(args.output_dir) + + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 4d9380e697..75ddda498c 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -68,6 +68,8 @@ PromptTuningInit, MultitaskPromptTuningConfig, MultitaskPromptTuningInit, + OFTConfig, + OFTModel, ) from .utils import ( TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, diff --git a/src/peft/mapping.py b/src/peft/mapping.py index f69e89ec3e..60503fa985 100644 --- a/src/peft/mapping.py +++ b/src/peft/mapping.py @@ -42,6 +42,8 @@ LoraConfig, LoraModel, MultitaskPromptTuningConfig, + OFTConfig, + OFTModel, PrefixTuningConfig, PromptEncoderConfig, PromptTuningConfig, @@ -73,6 +75,7 @@ "ADALORA": AdaLoraConfig, "IA3": IA3Config, "MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, + "OFT": OFTConfig, } PEFT_TYPE_TO_TUNER_MAPPING = { @@ -81,6 +84,7 @@ "LOKR": LoKrModel, "ADALORA": AdaLoraModel, "IA3": IA3Model, + "OFT": OFTModel, } diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 24ef48c22e..79bf8e4610 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -44,6 +44,7 @@ LoKrModel, LoraModel, MultitaskPromptEmbedding, + OFTModel, PrefixEncoder, PromptEmbedding, PromptEncoder, @@ -77,6 +78,7 @@ PeftType.ADALORA: AdaLoraModel, PeftType.ADAPTION_PROMPT: AdaptionPromptModel, PeftType.IA3: IA3Model, + PeftType.OFT: OFTModel, } diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 666e29d997..f5f665dd99 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -27,3 +27,4 @@ from .prefix_tuning import PrefixEncoder, PrefixTuningConfig from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .oft import OFTConfig, OFTModel diff --git a/src/peft/tuners/oft/__init__.py b/src/peft/tuners/oft/__init__.py new file mode 100644 index 0000000000..456c46ee07 --- /dev/null 
+++ b/src/peft/tuners/oft/__init__.py
@@ -0,0 +1,21 @@
+# coding=utf-8
+# Copyright 2023-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .config import OFTConfig
+from .layer import Conv2d, Linear, OFTLayer
+from .model import OFTModel
+
+
+__all__ = ["OFTConfig", "OFTModel", "Conv2d", "Linear", "OFTLayer"]
diff --git a/src/peft/tuners/oft/config.py b/src/peft/tuners/oft/config.py
new file mode 100644
index 0000000000..6b43255d1d
--- /dev/null
+++ b/src/peft/tuners/oft/config.py
@@ -0,0 +1,109 @@
+# coding=utf-8
+# Copyright 2023-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+from peft.tuners.lycoris_utils import LycorisConfig
+from peft.utils import PeftType
+
+
+@dataclass
+class OFTConfig(LycorisConfig):
+    """
+    This is the configuration class to store the configuration of an [`OFTModel`].
+
+    Args:
+        r (`int`): OFT rank.
+        module_dropout (`float`): The dropout probability for disabling OFT modules during training.
+        target_modules (`Union[List[str],str]`): The names of the modules to apply OFT to.
+        init_weights (`bool`): Whether to perform initialization of OFT weights.
+        layers_to_transform (`Union[List[int],int]`):
+            The layer indexes to transform, if this argument is specified, it will apply the OFT transformations on the
+            layer indexes that are specified in this list. If a single integer is passed, it will apply the OFT
+            transformations on the layer at this index.
+        layers_pattern (`str`):
+            The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
+            pattern is not in the common layers pattern.
+        rank_pattern (`dict`):
+            The mapping from layer names or regexp expression to ranks which are different from the default rank
+            specified by `r`.
+        modules_to_save (`List[str]`): The names of modules to be set as trainable except OFT parameters.
+        coft (`bool`): Whether to use the constrained variant of OFT or not.
+        eps (`float`):
+            The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True.
+        block_share (`bool`): Whether to share the OFT parameters between blocks or not.
+    """
+
+    r: int = field(default=8, metadata={"help": "OFT rank"})
+    module_dropout: float = field(
+        default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"}
+    )
+    target_modules: Optional[Union[List[str], str]] = field(
+        default=None,
+        metadata={
+            "help": "List of module names or regex expression of the module names to replace with OFT."
+            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
+        },
+    )
+    init_weights: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether to initialize the weights of the OFT layers with their default initialization. Don't change "
+                "this setting, except if you know exactly what you're doing."
+            ),
+        },
+    )
+    layers_to_transform: Optional[Union[List[int], int]] = field(
+        default=None,
+        metadata={
+            "help": "The layer indexes to transform, if this argument is specified, PEFT will transform only the layer indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
+        },
+    )
+    layers_pattern: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The layer pattern name, used only if `layers_to_transform` is different from None and if the layer pattern is not in the common layers pattern."
+        },
+    )
+    modules_to_save: Optional[List[str]] = field(
+        default=None,
+        metadata={
+            "help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. "
+            "For example, in Sequence Classification or Token Classification tasks, "
+            "the final layer `classifier/score` is randomly initialized and as such needs to be trainable and saved."
+        },
+    )
+    coft: bool = field(
+        default=False,
+        metadata={"help": "Whether to use the constrained variant of OFT or not."},
+    )
+    eps: float = field(
+        default=6e-5,
+        metadata={
+            "help": "The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True."
+        },
+    )
+    block_share: bool = field(
+        default=False,
+        metadata={"help": "Whether to share the OFT parameters between blocks or not."},
+    )
+
+    def __post_init__(self):
+        self.peft_type = PeftType.OFT
+        self.target_modules = (
+            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
+        )
diff --git a/src/peft/tuners/oft/layer.py b/src/peft/tuners/oft/layer.py
new file mode 100644
index 0000000000..b9e0d011b3
--- /dev/null
+++ b/src/peft/tuners/oft/layer.py
@@ -0,0 +1,375 @@
+# coding=utf-8
+# Copyright 2023-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
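Before the layer implementation, here is a minimal sketch of how the `OFTConfig` defined above can be applied to a model with `get_peft_model`. The toy module and its `proj` layer name are assumptions for illustration; the config values mirror the custom-model test cases added later in this patch:

```python
import torch
import torch.nn as nn

from peft import OFTConfig, get_peft_model


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)

    def forward(self, x):
        return self.proj(x)


# r=4 splits the 16 output features into four 4x4 orthogonal blocks;
# coft=True enables the constrained variant controlled by eps.
config = OFTConfig(r=4, target_modules=["proj"], module_dropout=0.0, coft=True, eps=6e-5)
model = get_peft_model(ToyModel(), config)
model.print_trainable_parameters()

# With the default init_weights=True the adapter starts as an identity rotation,
# so the wrapped model initially reproduces the base model's outputs.
out = model(torch.randn(2, 16))
print(out.shape)  # torch.Size([2, 16])
```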
+ +import math +import warnings +from typing import Any, List, Optional, Set, Tuple + +import torch +import torch.nn as nn + +from peft.tuners.lycoris_utils import LycorisLayer + + +class OFTLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("oft_r",) + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module): + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # OFT info + self.oft_r = nn.ParameterDict({}) + self.coft = {} + self.eps = {} + self.block_share = {} + + @property + def _available_adapters(self) -> Set[str]: + return {*self.oft_r} + + def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...], block_share: bool): + if block_share: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(1, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + else: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(r, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + + def reset_adapter_parameters(self, adapter_name: str): + nn.init.zeros_(self.oft_r[adapter_name]) + + def reset_adapter_parameters_random(self, adapter_name: str): + nn.init.kaiming_uniform_(self.oft_r[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + module_dropout: float, + init_weights: bool, + coft: bool = False, + eps: float = 6e-5, + block_share: bool = False, + **kwargs, + ) -> None: + """Internal function to create oft adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize weights. + coft (`bool`): Whether to use the constrainted variant of OFT or not. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): Whether to share the OFT parameters between blocks or not. 
+ """ + + self.r[adapter_name] = r + self.module_dropout[adapter_name] = module_dropout + self.coft[adapter_name] = coft + self.block_share[adapter_name] = block_share + + # Determine shape of OFT weights + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + shape = tuple(base_layer.weight.shape) + elif isinstance(base_layer, nn.Conv2d): + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ) + else: + raise TypeError(f"OFT is not implemented for base layers of type {type(base_layer).__name__}") + + self.eps[adapter_name] = eps * math.ceil(shape[0] / r) * math.ceil(shape[0] / r) + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape, block_share) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def unscale_layer(self, scale=None) -> None: + # scale is not used + pass + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + if self.merged: + warnings.warn( + f"Already following adapters were merged {','.join(self.merged_adapters)}. " + f"You are now additionally merging {','.join(self.active_adapters)}." + ) + if adapter_names is None: + adapter_names = self.active_adapters + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + + orig_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = orig_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + orig_weights = torch.transpose(orig_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if orig_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: orig_weights.shape[1], : orig_weights.shape[1]] + new_weights = torch.mm(orig_weights, delta_weight) + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = torch.transpose(new_weights, 0, 1) + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + + if safe_merge and not torch.isfinite(new_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = new_weights + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + new_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + new_weights = torch.transpose(new_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if new_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: new_weights.shape[1], : new_weights.shape[1]] + delta_inv = torch.inverse(delta_weight) + orig_weights = torch.mm(new_weights, delta_inv) + + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = torch.transpose(orig_weights, 0, 1) + orig_weights = orig_weights.reshape( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + base_layer.weight.data = orig_weights + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + rank = self.r[adapter_name] + coft = self.coft[adapter_name] + eps = self.eps[adapter_name] + opt_r = self.oft_r[adapter_name] + + if coft: + with torch.no_grad(): + opt_r.copy_(self._project_batch(opt_r, eps=eps)) + + orth_rotate = self._cayley_batch(opt_r) + weight = self._block_diagonal(orth_rotate, rank) + + return weight + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L144 + def _cayley_batch(self, data: torch.Tensor) -> torch.Tensor: + b, r, c = data.shape + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.transpose(1, 2)) + I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) + + # Perform the Cayley parametrization + Q = torch.bmm(I - skew, torch.inverse(I + skew)) + + return Q + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155 + def _block_diagonal(self, oft_r: torch.Tensor, rank: int) -> torch.Tensor: + if oft_r.shape[0] == 1: + # block share + blocks = [oft_r[0, ...] for i in range(rank)] + else: + blocks = [oft_r[i, ...] 
for i in range(rank)] + + # Use torch.block_diag to create the block diagonal matrix + A = torch.block_diag(*blocks) + + return A + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52 + def _project_batch(self, oft_r, eps=1e-5): + # scaling factor for each of the smaller block matrix + eps = eps * 1 / torch.sqrt(torch.tensor(oft_r.shape[0])) + I = ( + torch.zeros((oft_r.size(1), oft_r.size(1)), device=oft_r.device, dtype=oft_r.dtype) + .unsqueeze(0) + .expand_as(oft_r) + ) + diff = oft_r - I + norm_diff = torch.norm(oft_r - I, dim=(1, 2), keepdim=True) + mask = (norm_diff <= eps).bool() + out = torch.where(mask, oft_r, I + eps * (diff / norm_diff)) + return out + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + if len(result.shape) == 4: + result = result.permute(0, 2, 3, 1) + + base_layer = self.get_base_layer() + base_bias = base_layer.bias + if base_bias is not None: + # Bias should be added after OFT forward + result = result - base_bias.data + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = self._get_delta_activations(active_adapter, result, *args, **kwargs) + + if base_bias is not None: + result = result + base_bias.data + if len(result.shape) == 4: + result = result.permute(0, 3, 1, 2) + + result = result.to(previous_dtype) + return result + + +class Linear(OFTLayer): + """OFT implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." 
+ rep + + +class Conv2d(OFTLayer): + """OFT implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." + rep diff --git a/src/peft/tuners/oft/model.py b/src/peft/tuners/oft/model.py new file mode 100644 index 0000000000..4b7953daa9 --- /dev/null +++ b/src/peft/tuners/oft/model.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, OFTLayer + + +class OFTModel(LycorisTuner): + """ + Creates Orthogonal Finetuning model from a pretrained model. The method is described in + https://arxiv.org/abs/2306.07280 + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`OFTConfig`]): The configuration of the OFT model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The OFT model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import OFTModel, OFTConfig + + >>> config_te = OFTConfig( + ... r=8, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = OFTConfig( + ... r=8, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = OFTModel(model.text_encoder, config_te, "default") + >>> model.unet = OFTModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`OFTConfig`]): The configuration of the OFT model. 
+ """ + + prefix: str = "oft_" + layers_mapping: Dict[Type[torch.nn.Module], Type[OFTLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[OFTLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + **optional_kwargs, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(config.rank_pattern.keys()) + target_name_key = next(filter(lambda key: re.match(f"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + + if isinstance(target, OFTLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 29c764a08f..93b892d9e5 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -30,6 +30,7 @@ class PeftType(str, enum.Enum): IA3 = "IA3" LOHA = "LOHA" LOKR = "LOKR" + OFT = "OFT" class TaskType(str, enum.Enum): diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 97bde0d6fe..c5da274085 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -113,6 +113,8 @@ def get_peft_model_state_dict( to_return["prompt_embeddings"] = prompt_embeddings elif config.peft_type == PeftType.IA3: to_return = {k: state_dict[k] for k in state_dict if "ia3_" in k} + elif config.peft_type == PeftType.OFT: + to_return = {k: state_dict[k] for k in state_dict if "oft_" in k} else: raise NotImplementedError if getattr(model, "modules_to_save", None) is not None: @@ -166,7 +168,7 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul else: state_dict = peft_model_state_dict - if config.peft_type in (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.IA3): + if config.peft_type in (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.IA3, PeftType.OFT): peft_model_state_dict = {} parameter_prefix = { PeftType.IA3: "ia3_", @@ -174,6 +176,7 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul PeftType.ADALORA: "lora_", PeftType.LOHA: "hada_", PeftType.LOKR: "lokr_", + PeftType.OFT: "oft_", }[config.peft_type] for k, v in state_dict.items(): if parameter_prefix in k: diff --git a/tests/test_config.py b/tests/test_config.py index 34f04232a9..06e72dae8e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -30,6 +30,7 @@ LoHaConfig, LoraConfig, MultitaskPromptTuningConfig, + OFTConfig, PeftConfig, PrefixTuningConfig, PromptEncoder, @@ -51,6 +52,7 @@ PrefixTuningConfig, PromptEncoderConfig, PromptTuningConfig, + OFTConfig, ) @@ -189,7 +191,7 @@ def test_prompt_encoder_warning_num_layers(self): expected_msg = "for MLP, the argument `encoder_num_layers` is ignored. Exactly 2 MLP layers are used." 
assert str(record.list[0].message) == expected_msg - @parameterized.expand([LoHaConfig, LoraConfig, IA3Config]) + @parameterized.expand([LoHaConfig, LoraConfig, IA3Config, OFTConfig]) def test_save_pretrained_with_target_modules(self, config_class): # See #1041, #1045 config = config_class(target_modules=["a", "list"]) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index b298388a84..4785526b26 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -24,7 +24,7 @@ from torch import nn from transformers.pytorch_utils import Conv1D -from peft import AdaLoraConfig, IA3Config, LoHaConfig, LoKrConfig, LoraConfig, PeftModel, get_peft_model +from peft import AdaLoraConfig, IA3Config, LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, PeftModel, get_peft_model from peft.tuners.tuners_utils import BaseTunerLayer from .testing_common import PeftCommonTester @@ -191,6 +191,28 @@ "decompose_factor": 4, }, ), + ######## + # OFT # + ######## + ("Vanilla MLP 1 OFT", "MLP", OFTConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 5 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ( + "Vanilla MLP 6 OFT", + "MLP", + OFTConfig, + { + "target_modules": ["lin0"], + "module_dropout": 0.1, + }, + ), + ("Vanilla MLP 7 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "coft": True}), + ("Vanilla MLP 8 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "block_share": True}), + ("Vanilla MLP 9 OFT", "MLP", OFTConfig, {"target_modules": ["lin0"], "coft": True, "block_share": True}), + ("Conv2d 1 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"]}), + ("Conv2d 3 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "coft": True}), + ("Conv2d 4 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "block_share": True}), + ("Conv2d 5 OFT", "Conv2d", OFTConfig, {"target_modules": ["conv2d"], "coft": True, "block_share": True}), ] MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES = [ @@ -258,6 +280,7 @@ LoraConfig: "lora_", LoHaConfig: "hada_", LoKrConfig: "lokr_", + OFTConfig: "oft_", } @@ -833,6 +856,7 @@ def test_targeting_lora_to_embedding_layer_non_transformers(self, save_embedding LoHaConfig(target_modules=["lin0"], init_weights=False), AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False), IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False), + OFTConfig(target_modules=["lin0"], init_weights=False), ] ) def test_adapter_name_makes_no_difference(self, config0): @@ -1852,3 +1876,80 @@ def test_requires_grad_lokr_same_targets(self): "base_model.model.lin0.lokr_w1.adapter1", "base_model.model.lin0.lokr_w2.adapter1", ) + + def test_requires_grad_oft_different_targets(self): + # test two different OFT adapters that target different modules + config0 = OFTConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = OFTConfig(target_modules=["lin1"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active pter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # change activate pter to pter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.oft_r.adapter1", + ) + + # disable all pters 
+ with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.oft_r.adapter1", + ) + + def test_requires_grad_oft_same_targets(self): + # same as previous test, except that OFT adapters target the same layer + config0 = OFTConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = OFTConfig(target_modules=["lin0"], inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_r.adapter1", + ) diff --git a/tests/test_stablediffusion.py b/tests/test_stablediffusion.py index 830614a7ab..660c17caea 100644 --- a/tests/test_stablediffusion.py +++ b/tests/test_stablediffusion.py @@ -20,7 +20,7 @@ from diffusers import StableDiffusionPipeline from parameterized import parameterized -from peft import LoHaConfig, LoraConfig, get_peft_model +from peft import LoHaConfig, LoraConfig, OFTConfig, get_peft_model from .testing_common import ClassInstantier, PeftCommonTester from .testing_utils import temp_seed @@ -60,11 +60,24 @@ "module_dropout": 0.0, }, }, + { + "text_encoder": { + "r": 8, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "module_dropout": 0.0, + }, + "unet": { + "r": 8, + "target_modules": ["proj_in", "proj_out", "to_k", "to_q", "to_v", "to_out.0", "ff.net.0.proj", "ff.net.2"], + "module_dropout": 0.0, + }, + }, ) CLASSES_MAPPING = { "lora": (LoraConfig, CONFIG_TESTING_KWARGS[0]), "loha": (LoHaConfig, CONFIG_TESTING_KWARGS[1]), "lokr": (LoHaConfig, CONFIG_TESTING_KWARGS[1]), + "oft": (OFTConfig, CONFIG_TESTING_KWARGS[2]), } @@ -115,13 +128,14 @@ def prepare_inputs_for_testing(self): "model_ids": PEFT_DIFFUSERS_SD_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, "loha_kwargs": {"init_weights": [False]}, + "oft_kwargs": {"init_weights": [False]}, }, ) ) def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): - if config_cls == LoHaConfig: + if config_cls in [LoHaConfig, OFTConfig]: # TODO: This test is flaky with PyTorch 2.1 on Windows, we need to figure out what is going on - self.skipTest("LoHaConfig test is flaky") + self.skipTest("LoHaConfig and OFTConfig test is flaky") # Instantiate model & adapters model = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) @@ -148,7 +162,7 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): "model_ids": PEFT_DIFFUSERS_SD_MODELS_TO_TEST, "lora_kwargs": {"init_lora_weights": [False]}, }, - filter_params_func=lambda tests: [x for x in tests if all(s not in x[0] for s in ["loha", "lokr"])], + filter_params_func=lambda tests: [x for x in tests if all(s not in x[0] for s in ["loha", "lokr", "oft"])], ) ) def 
test_add_weighted_adapter_base_unchanged(self, test_name, model_id, config_cls, config_kwargs): @@ -178,6 +192,7 @@ def test_add_weighted_adapter_base_unchanged(self, test_name, model_id, config_c "lora_kwargs": {"init_lora_weights": [False]}, "loha_kwargs": {"init_weights": [False]}, "lokr_kwargs": {"init_weights": [False]}, + "oft_kwargs": {"init_weights": [False]}, }, ) ) diff --git a/tests/testing_common.py b/tests/testing_common.py index 00809c2bc1..0c081cde2c 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -574,7 +574,7 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): self.assertTrue(torch.allclose(logits_merged, logits_merged_from_pretrained, atol=atol, rtol=rtol)) def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3, PeftType.OFT] if ("gpt2" in model_id.lower()) and (config_cls == IA3Config): self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") @@ -886,7 +886,7 @@ def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwar self.assertIsNotNone(param.grad) def _test_delete_adapter(self, model_id, config_cls, config_kwargs): - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3, PeftType.OFT] # IA3 does not support deleting adapters yet, but it just needs to be added # AdaLora does not support multiple adapters config = config_cls( @@ -924,7 +924,7 @@ def _test_delete_adapter(self, model_id, config_cls, config_kwargs): def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): # same as test_delete_adapter, but this time an inactive adapter is deleted - supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3] + supported_peft_types = [PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.IA3, PeftType.OFT] # IA3 does not support deleting adapters yet, but it just needs to be added # AdaLora does not support multiple adapters config = config_cls(