Add LoftQ initialization method for LoRA (huggingface#1150)
---------

Co-authored-by: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
3 people committed Nov 30, 2023
1 parent 33717ba commit 05819db
Showing 12 changed files with 1,514 additions and 12 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -34,6 +34,7 @@ Supported methods:
7. MultiTask Prompt Tuning: [Multitask Prompt Tuning Enables Parameter-Efficient Transfer Learning](https://arxiv.org/abs/2303.02861)
8. LoHa: [FedPara: Low-Rank Hadamard Product for Communication-Efficient Federated Learning](https://arxiv.org/abs/2108.06098)
9. LoKr: [KronA: Parameter Efficient Tuning with Kronecker Adapter](https://arxiv.org/abs/2212.10650) based on [Navigating Text-To-Image Customization: From LyCORIS Fine-Tuning to Model Evaluation](https://arxiv.org/abs/2309.14859) implementation
10. LoftQ: [LoftQ: LoRA-Fine-Tuning-aware Quantization for Large Language Models](https://arxiv.org/abs/2310.08659)

## Getting started

69 changes: 69 additions & 0 deletions examples/loftq_finetuning/README.md
@@ -0,0 +1,69 @@
# LoftQ: LoRA-fine-tuning-aware Quantization

## Introduction

LoftQ provides a better initialization for the LoRA adapters A and B,
computed jointly with the quantization of the pre-trained weights W.
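
In PEFT, LoftQ initialization is enabled through `LoraConfig` together with a `LoftQConfig`. The minimal sketch below illustrates this; the base model id and target modules are placeholders, see `quantize_save_load.py` in this folder for a complete example.

```python
from transformers import AutoModelForCausalLM
from peft import LoftQConfig, LoraConfig, get_peft_model

# Load a full-precision (fp16/fp32) base model; the model id is a placeholder.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Alternate between quantizing W and re-fitting A, B for `loftq_iter` steps.
loftq_config = LoftQConfig(loftq_bits=4, loftq_iter=5)
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],  # placeholder target modules
    init_lora_weights="loftq",            # use LoftQ instead of the default LoRA init
    loftq_config=loftq_config,
)
peft_model = get_peft_model(model, lora_config)
```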

## Quantization
We recommend saving the quantized backbone model in fp16/fp32
and loading it as [NormalFloat4](https://arxiv.org/abs/2305.14314).
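
A minimal sketch (not part of the example scripts) of loading a backbone saved in fp16/fp32 as NormalFloat4 via bitsandbytes; the local path is a placeholder:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
backbone = AutoModelForCausalLM.from_pretrained(
    "model_zoo/loftq/Llama-2-7b-hf-4bit-16rank",  # placeholder path to the saved backbone
    quantization_config=nf4_config,
    device_map="auto",
)
```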

We provide a simple example showing how to quantize the llama-2-7b model and save/load it.

```sh
python quantize_save_load.py \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--token HF_TOKEN \
--bits 4 --iter 5 --rank 16 \
--save_dir model_zoo/loftq/
```

- `HF_TOKEN` is the token used to access the [LLAMA models](https://huggingface.co/meta-llama).
- The `quantize_and_save()` function quantizes the backbone and initializes the LoRA adapters.
  It creates two folders under `$save_dir`: the quantized backbone is stored at `Llama-2-7b-hf-4bit-16rank`,
  and the LoRA adapters are stored in the sub-folder `Llama-2-7b-hf-4bit-16rank/loftq_init`.

## Fine-tuning

Here is an example to load the quantized backbone and LoRA adapters:

```python
import os

from transformers import AutoModelForCausalLM
from peft import PeftModel


base_model = AutoModelForCausalLM.from_pretrained(
os.path.join(args.save_dir, "Llama-2-7b-hf-4bit-16rank"),
load_in_4bit=True,
)
peft_model = PeftModel.from_pretrained(
base_model,
os.path.join(args.save_dir, "Llama-2-7b-hf-4bit-16rank", "loftq_init"),
is_trainable=True,
)
```
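
After this, `peft_model` can be fine-tuned like any other PEFT model. As a quick check, only the LoRA adapters A and B should be reported as trainable:

```python
peft_model.print_trainable_parameters()
```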

We also provide an example of fine-tuning with LoftQ on GSM8K.
It loads the quantized backbone and LoRA adapters from the [LoftQ Hugging Face Hub](https://huggingface.co/LoftQ); a minimal loading sketch follows the command below.

```sh
python train_gsm8k_llama.py \
--model_name_or_path LoftQ/Llama-2-7b-hf-4bit-64rank \
--output_dir exp_results/gsm8k/llama-2-7b/bit4-rank64/lr3e-4 \
--learning_rate 3e-4 \
--seed 202 \
--dataset_name gsm8k \
--dataset_config main \
--pad_to_max_length \
--max_source_length 128 \
--max_target_length 256 \
--num_train_epochs 5 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 4 \
--with_tracking \
--report_to tensorboard
```
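
If you prefer to load a LoftQ checkpoint from the Hub in your own code, a minimal sketch looks like the following. It assumes the Hub repository uses the same layout as produced by `quantize_save_load.py`, i.e. the adapters live in a `loftq_init` subfolder.

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

MODEL_ID = "LoftQ/Llama-2-7b-hf-4bit-64rank"

# Load the backbone as NF4, matching the quantization used at LoftQ init time.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4"),
    device_map="auto",
)
peft_model = PeftModel.from_pretrained(
    base_model,
    MODEL_ID,
    subfolder="loftq_init",  # assumed adapter subfolder, mirroring the local save layout
    is_trainable=True,
)
```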
244 changes: 244 additions & 0 deletions examples/loftq_finetuning/quantize_save_load.py
@@ -0,0 +1,244 @@
# coding=utf-8
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os

import torch
import torch.nn as nn
from transformers import (
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoTokenizer,
BitsAndBytesConfig,
)

from peft import LoftQConfig, LoraConfig, PeftModel, TaskType, get_peft_model


# Minimal container holding only the frozen weight/bias of a layer; used by
# unwrap_model() below to replace LoRA-wrapped layers before saving the backbone.
class Shell(nn.Module):
def __init__(self, weight, bias=None):
super().__init__()
self.weight = nn.Parameter(weight, requires_grad=False)
if bias is not None:
self.bias = nn.Parameter(bias, requires_grad=False)


# Replace every LoRA-wrapped submodule with a plain Shell holding only its base
# weight/bias, so the quantized backbone can be saved without the adapter layers.
def unwrap_model(model, sub_module_name=".base_layer"):
sub_module_name_list = [k.split(sub_module_name)[0] for k in model.state_dict().keys() if sub_module_name in k]
sub_module_name_set = set(sub_module_name_list)
for name in sub_module_name_set:
# get the parent of the submodule
name_parent = ".".join(name.split(".")[:-1])
name_child = name.split(".")[-1]
sub_module = model.get_submodule(name_parent)
print(sub_module)

# replace with shell
child = getattr(sub_module, name_child)
weight = getattr(child.base_layer, "weight", None)
bias = getattr(child.base_layer, "bias", None)
shell = Shell(weight, bias)

setattr(sub_module, name_child, shell)

print("You have unwrapped the model. Use it on your own risk.")


def print_model(model, name):
print("=" * 10 + name + "=" * 10)
print(model)
for name, param in model.named_parameters():
if torch.is_tensor(param):
if param.dtype in [torch.float32, torch.float16]:
print(
name,
param.shape,
param.device,
param.dtype,
param.requires_grad,
param.mean().item(),
param.max().item(),
)
else:
print(name, param.shape, param.device, param.dtype, param.requires_grad)


def arg_parse():
parser = argparse.ArgumentParser(description="Quantize a model with LoftQ.")
parser.add_argument(
"--model_name_or_path",
type=str,
default=None,
required=True,
help="The name or path of the fp32/16 model.",
)
parser.add_argument(
"--token",
type=str,
default=None,
help="The access token to download model from HuggingFace Hub.",
)
parser.add_argument(
"--bits",
type=int,
default=4,
help="The quantized bits",
)
parser.add_argument(
"--iter",
type=int,
default=1,
help="The alternating steps in LoftQ",
)
parser.add_argument(
"--rank",
type=int,
default=16,
help="The rank of the LoRA adapter",
)
parser.add_argument(
"--save_dir",
type=str,
default="./model_zoo/loftq/",
help="The rank of the LoRA adapter",
)
args = parser.parse_args()
return args


def quantize_and_save():
args = arg_parse()

# Download weights and configure LoRA
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, token=args.token, trust_remote_code=True)
if any(name in args.model_name_or_path.lower() for name in ["llama", "mistral", "falcon"]):
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path, token=args.token, trust_remote_code=True, device_map="auto"
)
task_type = TaskType.CAUSAL_LM
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"]

elif any(name in args.model_name_or_path.lower() for name in ["bart", "t5"]):
model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path, token=args.token, device_map="auto")
task_type = TaskType.SEQ_2_SEQ_LM
target_modules = ["q_proj", "k_proj", "v_proj", "fc1", "fc2", "out_proj"]

elif any(name in args.model_name_or_path.lower() for name in ["deberta", "roberta", "bert"]):
model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, token=args.token)
model = model.cuda()
task_type = TaskType.SEQ_CLS
target_modules = ["query_proj", "key_proj", "value_proj", "dense"] # embeddings not supported by peft
else:
raise NotImplementedError("Other models not supported yet.")

# Config of LoftQ
loftq_config = LoftQConfig(loftq_bits=args.bits, loftq_iter=args.iter)

lora_config = LoraConfig(
task_type=task_type,
inference_mode=True,
r=args.rank,
lora_alpha=16 if task_type is TaskType.CAUSAL_LM else args.rank,
lora_dropout=0.1,
target_modules=target_modules,
init_lora_weights="loftq",
loftq_config=loftq_config,
)

# Obtain LoftQ model
lora_model = get_peft_model(model, lora_config)
base_model = lora_model.get_base_model()

# Save LoftQ model
model_name = args.model_name_or_path.split("/")[-1] + f"-{args.bits}bit" + f"-{args.rank}rank"
base_model_dir = os.path.join(args.save_dir, model_name)
    lora_model_dir = os.path.join(args.save_dir, model_name, "loftq_init")

# save lora adapters first
lora_model.base_model.peft_config[
"default"
].base_model_name_or_path = base_model_dir # This can be a local path or Hub model id
lora_model.base_model.peft_config["default"].init_lora_weights = True # Don't apply LoftQ when loading again

lora_model.save_pretrained(lora_model_dir)
print_model(lora_model, "lora_model")

# remove lora adapters and save the backbone
    unwrap_model(base_model)
base_model.save_pretrained(base_model_dir)
tokenizer.save_pretrained(base_model_dir)

print_model(base_model, "base_model")

return base_model_dir, lora_model_dir


def load_loftq(base_model_path, lora_adapter_path):
if any(name in base_model_path.lower() for name in ["llama", "mistral", "falcon"]):
model = AutoModelForCausalLM.from_pretrained(
base_model_path,
device_map="auto",
low_cpu_mem_usage=True,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=False,
bnb_4bit_quant_type="nf4",
),
)
elif any(name in base_model_path.lower() for name in ["bart", "t5"]):
model = AutoModelForSeq2SeqLM.from_pretrained(
base_model_path,
device_map="auto",
low_cpu_mem_usage=True,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=False,
bnb_4bit_quant_type="nf4",
),
)
elif any(name in base_model_path.lower() for name in ["deberta", "roberta", "bert"]):
model = AutoModelForSequenceClassification.from_pretrained(
base_model_path,
low_cpu_mem_usage=True,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=False,
bnb_4bit_quant_type="nf4",
),
)
else:
raise NotImplementedError("Other models not supported yet.")

lora_model = PeftModel.from_pretrained(model, lora_adapter_path, is_trainable=True)

# Do training or inference below
print_model(lora_model, "lora_model")
print_model(model, "base_model")


if __name__ == "__main__":
base_dir, lora_dir = quantize_and_save()
load_loftq(base_dir, lora_dir)

# example command:
# python quantize_save_load.py \
# --model_name_or_path meta-llama/Llama-2-7b-hf \
# --token XXX \
# --bits 4 --iter 5 --rank 16 \
# --save_dir ./model_zoo/loftq/