From a129a1407301b41fbf110fc4a38aef10fd2cc1df Mon Sep 17 00:00:00 2001
From: Lyu Han
Date: Wed, 7 Aug 2024 11:34:22 +0800
Subject: [PATCH 01/39] bump version to v0.5.3 (#2242)

---
 README.md | 2 +-
 README_ja.md | 2 +-
 README_zh-CN.md | 2 +-
 docs/en/get_started.md | 2 +-
 docs/en/multi_modal/cogvlm.md | 2 +-
 docs/zh_cn/get_started.md | 2 +-
 docs/zh_cn/multi_modal/cogvlm.md | 2 +-
 lmdeploy/version.py | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f43d765ed3..1feb2bf71a 100644
--- a/README.md
+++ b/README.md
@@ -176,7 +176,7 @@ pip install lmdeploy
 Since v0.3.0, The default prebuilt package is compiled on **CUDA 12**. However, if CUDA 11+ is required, you can install lmdeploy by:
 
 ```shell
-export LMDEPLOY_VERSION=0.5.2
+export LMDEPLOY_VERSION=0.5.3
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
diff --git a/README_ja.md b/README_ja.md
index 4cf67cbd45..9b72b8cde1 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -177,7 +177,7 @@ pip install lmdeploy
 v0.3.0以降、デフォルトのプリビルドパッケージは**CUDA 12**でコンパイルされています。ただし、CUDA 11+が必要な場合は、次のコマンドでlmdeployをインストールできます:
 
 ```shell
-export LMDEPLOY_VERSION=0.5.2
+export LMDEPLOY_VERSION=0.5.3
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
diff --git a/README_zh-CN.md b/README_zh-CN.md
index c0c3f20f03..5ea1c66ad7 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -177,7 +177,7 @@ pip install lmdeploy
 自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令:
 
 ```shell
-export LMDEPLOY_VERSION=0.5.2
+export LMDEPLOY_VERSION=0.5.3
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
diff --git a/docs/en/get_started.md b/docs/en/get_started.md
index f390cc5474..c79a295ea1 100644
--- a/docs/en/get_started.md
+++ b/docs/en/get_started.md
@@ -13,7 +13,7 @@ pip install lmdeploy
 The default prebuilt package is compiled on **CUDA 12**. However, if CUDA 11+ is required, you can install lmdeploy by:
 
 ```shell
-export LMDEPLOY_VERSION=0.5.2
+export LMDEPLOY_VERSION=0.5.3
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
diff --git a/docs/en/multi_modal/cogvlm.md b/docs/en/multi_modal/cogvlm.md
index ad0045da25..256683bcc3 100644
--- a/docs/en/multi_modal/cogvlm.md
+++ b/docs/en/multi_modal/cogvlm.md
@@ -22,7 +22,7 @@ Install LMDeploy with pip (Python 3.8+). Refer to [Installation](https://lmdeplo
 ```shell
 # cuda 11.8
 # to get the latest version, run: pip index versions lmdeploy
-export LMDEPLOY_VERSION=0.5.2
+export LMDEPLOY_VERSION=0.5.3
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 # cuda 12.1
diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md
index 8577f4834d..32e06b6c41 100644
--- a/docs/zh_cn/get_started.md
+++ b/docs/zh_cn/get_started.md
@@ -13,7 +13,7 @@ pip install lmdeploy
 LMDeploy的预编译包默认是基于 CUDA 12 编译的。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令:
 
 ```shell
-export LMDEPLOY_VERSION=0.5.2
+export LMDEPLOY_VERSION=0.5.3
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
diff --git a/docs/zh_cn/multi_modal/cogvlm.md b/docs/zh_cn/multi_modal/cogvlm.md
index d932a08310..91ae9a3d4a 100644
--- a/docs/zh_cn/multi_modal/cogvlm.md
+++ b/docs/zh_cn/multi_modal/cogvlm.md
@@ -21,7 +21,7 @@ pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https:
 
 ```shell
 # cuda 11.8
-export LMDEPLOY_VERSION=0.5.2
+export LMDEPLOY_VERSION=0.5.3
 export PYTHON_VERSION=38
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 # cuda 12.1
diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index 28e376f14a..4315131ff6 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple -__version__ = '0.5.2.post1' +__version__ = '0.5.3' short_version = __version__ From 08cda6dcb6ce43934099294912904c542d350f49 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Wed, 7 Aug 2024 15:42:48 +0800 Subject: [PATCH 02/39] Reorganize the user guide and update the get_started section (#2038) * update * adjust directory structure * set depth 2 * check in installation.md * check in installation.md * update quick start * update supported platforms * update supported GPUs * typo * update * update api_server * update * format the doc * fix lint * update generate.sh * rollback pipeline.md * update * update zh_cn * update * fix lint * fix lint * fix * remove build.md * debug --------- Co-authored-by: RunningLeon --- README.md | 31 ++- README_ja.md | 31 ++- README_zh-CN.md | 30 ++- docs/en/advance/debug_turbomind.md | 2 +- .../en/benchmark/evaluate_with_opencompass.md | 6 +- docs/en/benchmark/profile_api_server.md | 2 +- docs/en/build.md | 85 -------- docs/en/conf.py | 8 +- docs/en/get_started.md | 204 ++++++++++++++---- docs/en/index.rst | 56 +++-- docs/en/installation.md | 80 +++++++ docs/en/{serving => llm}/api_server.md | 21 +- docs/en/{serving => llm}/api_server_tools.md | 4 +- docs/en/{serving => llm}/gradio.md | 2 +- docs/en/{inference => llm}/pipeline.md | 2 +- docs/en/{serving => llm}/proxy_server.md | 0 docs/en/{serving => llm}/qos.md | 0 .../{serving => multi_modal}/api_server_vl.md | 4 +- docs/en/multi_modal/cogvlm.md | 2 +- docs/en/multi_modal/index.rst | 12 ++ docs/en/multi_modal/internvl.md | 3 + docs/en/multi_modal/llava.md | 3 + .../{inference => multi_modal}/vl_pipeline.md | 4 +- docs/en/quantization/kv_quant.md | 4 +- docs/en/quantization/w4a16.md | 6 +- docs/en/quantization/w8a8.md | 2 +- docs/en/supported_models/codellama.md | 2 +- docs/zh_cn/advance/debug_turbomind.md | 2 +- .../benchmark/evaluate_with_opencompass.md | 6 +- docs/zh_cn/benchmark/profile_api_server.md | 2 +- docs/zh_cn/build.md | 86 -------- docs/zh_cn/conf.py | 8 +- docs/zh_cn/get_started.md | 204 +++++++++++++++--- docs/zh_cn/index.rst | 55 +++-- docs/zh_cn/installation.md | 80 +++++++ docs/zh_cn/{serving => llm}/api_server.md | 0 .../{serving => llm}/api_server_tools.md | 0 docs/zh_cn/{serving => llm}/gradio.md | 0 docs/zh_cn/{inference => llm}/pipeline.md | 0 docs/zh_cn/{serving => llm}/proxy_server.md | 0 docs/zh_cn/{serving => llm}/qos.md | 0 .../{serving => multi_modal}/api_server_vl.md | 2 +- docs/zh_cn/multi_modal/index.rst | 12 ++ docs/zh_cn/multi_modal/internvl.md | 3 + docs/zh_cn/multi_modal/llava.md | 3 + .../{inference => multi_modal}/vl_pipeline.md | 2 +- docs/zh_cn/quantization/w4a16.md | 4 +- docs/zh_cn/supported_models/codellama.md | 2 +- generate.sh | 2 +- 49 files changed, 663 insertions(+), 416 deletions(-) delete mode 100644 docs/en/build.md create mode 100644 docs/en/installation.md rename docs/en/{serving => llm}/api_server.md (93%) rename docs/en/{serving => llm}/api_server_tools.md (99%) rename docs/en/{serving => llm}/gradio.md (98%) rename docs/en/{inference => llm}/pipeline.md (99%) rename docs/en/{serving => llm}/proxy_server.md (100%) rename docs/en/{serving => llm}/qos.md (100%) rename docs/en/{serving => multi_modal}/api_server_vl.md (97%) create mode 100644 docs/en/multi_modal/index.rst create mode 100644 docs/en/multi_modal/internvl.md create mode 100644 docs/en/multi_modal/llava.md rename docs/en/{inference => multi_modal}/vl_pipeline.md (98%) delete mode 100644 docs/zh_cn/build.md create mode 100644 docs/zh_cn/installation.md rename 
docs/zh_cn/{serving => llm}/api_server.md (100%) rename docs/zh_cn/{serving => llm}/api_server_tools.md (100%) rename docs/zh_cn/{serving => llm}/gradio.md (100%) rename docs/zh_cn/{inference => llm}/pipeline.md (100%) rename docs/zh_cn/{serving => llm}/proxy_server.md (100%) rename docs/zh_cn/{serving => llm}/qos.md (100%) rename docs/zh_cn/{serving => multi_modal}/api_server_vl.md (99%) create mode 100644 docs/zh_cn/multi_modal/index.rst create mode 100644 docs/zh_cn/multi_modal/internvl.md create mode 100644 docs/zh_cn/multi_modal/llava.md rename docs/zh_cn/{inference => multi_modal}/vl_pipeline.md (99%) diff --git a/README.md b/README.md index 1feb2bf71a..4a6ecdf51b 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLMs inference - \[2024/07\] 🎉🎉 Support Llama3.1 8B, 70B and its TOOLS CALLING -- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5 +- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5 - \[2024/06\] PyTorch engine support DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, LlaVA-Next - \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs - \[2024/05\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2 @@ -39,8 +39,8 @@ ______________________________________________________________________ - \[2024/03\] Support DeepSeek-VL offline inference pipeline and serving. - \[2024/03\] Support VLM offline inference pipeline and serving. - \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on. -- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/api_server.md). -- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md) +- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](docs/en/llm/api_server.md). +- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](docs/en/llm/proxy_server.md) - \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies. @@ -167,19 +167,16 @@ They differ in the types of supported models and the inference data type. Please ## Installation -Install lmdeploy with pip ( python 3.8+) or [from source](./docs/en/build.md) +It is recommended installing lmdeploy using pip in a conda environment (python 3.8 - 3.12): ```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy pip install lmdeploy ``` -Since v0.3.0, The default prebuilt package is compiled on **CUDA 12**. 
However, if CUDA 11+ is required, you can install lmdeploy by: - -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +The default prebuilt package is compiled on **CUDA 12** since v0.3.0. +For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](./docs/en/installation.md). ## Offline Batch Inference @@ -195,7 +192,7 @@ print(response) > > `export LMDEPLOY_USE_MODELSCOPE=True` -For more information about inference pipeline, please refer to [here](./docs/en/inference/pipeline.md). +For more information about inference pipeline, please refer to [here](docs/en/llm/pipeline.md). # Tutorials @@ -204,10 +201,10 @@ Please review [getting_started](./docs/en/get_started.md) section for the basic For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/): - User Guide - - [LLM Inference pipeline](./docs/en/inference/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - - [VLM Inference pipeline](./docs/en/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - - [LLM Serving](docs/en/serving/api_server.md) - - [VLM Serving](docs/en/serving/api_server_vl.md) + - [LLM Inference pipeline](docs/en/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) + - [VLM Inference pipeline](docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) + - [LLM Serving](docs/en/llm/api_server.md) + - [VLM Serving](docs/en/multi_modal/api_server_vl.md) - [Quantization](docs/en/quantization) - Advance Guide - [Inference Engine - TurboMind](docs/en/inference/turbomind.md) @@ -216,7 +213,7 @@ For detailed user guides and advanced guides, please refer to our [tutorials](ht - [Add a new model](docs/en/advance/pytorch_new_model.md) - gemm tuning - [Long context inference](docs/en/advance/long_context.md) - - [Multi-model inference service](docs/en/serving/proxy_server.md) + - [Multi-model inference service](docs/en/llm/proxy_server.md) # Third-party projects diff --git a/README_ja.md b/README_ja.md index 9b72b8cde1..62b77e2149 100644 --- a/README_ja.md +++ b/README_ja.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeployは[modelscope/swift](https://github.com/modelscope/swift)に統合され、VLMs推論のデフォルトアクセラレータとなりました - \[2024/07\] 🎉🎉 Llama3.1 8B、70Bおよびそのツールコールをサポート -- \[2024/07\] [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e)全シリーズモデル、[InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md)およびInternLM2.5の[ファンクションコール](docs/en/serving/api_server_tools.md)をサポート +- \[2024/07\] 
[InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e)全シリーズモデル、[InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md)およびInternLM2.5の[ファンクションコール](docs/en/llm/api_server_tools.md)をサポート - \[2024/06\] PyTorchエンジンはDeepSeek-V2およびいくつかのVLMs、例えばCogVLM2、Mini-InternVL、LlaVA-Nextをサポート - \[2024/05\] 複数のGPUでVLMsをデプロイする際にビジョンモデルをバランスさせる - \[2024/05\] InternVL v1.5、LLaVa、InternLMXComposer2などのVLMsで4ビットの重みのみの量子化と推論をサポート @@ -39,8 +39,8 @@ ______________________________________________________________________ - \[2024/03\] DeepSeek-VLのオフライン推論パイプラインとサービングをサポート - \[2024/03\] VLMのオフライン推論パイプラインとサービングをサポート - \[2024/02\] Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOEなどをサポート -- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE)が[LMDeployサービングサービス](./docs/en/serving/api_server.md)とシームレスに統合されました -- \[2024/01\] 複数モデル、複数マシン、複数カードの推論サービスをサポート。使用方法は[こちら](./docs/en/serving/proxy_server.md)を参照してください +- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE)が[LMDeployサービングサービス](./docs/en/llm/api_server.md)とシームレスに統合されました +- \[2024/01\] 複数モデル、複数マシン、複数カードの推論サービスをサポート。使用方法は[こちら](./docs/en/llm/proxy_server.md)を参照してください - \[2024/01\] [PyTorch推論エンジン](./docs/en/inference/pytorch.md)をサポートし、完全にPythonで開発されており、開発者の障壁を下げ、新機能や技術の迅速な実験を可能にします @@ -168,19 +168,16 @@ LMDeployは、[TurboMind](./docs/en/inference/turbomind.md)および[PyTorch](./ ## インストール -pip(python 3.8+)を使用してlmdeployをインストールするか、[ソースからインストール](./docs/en/build.md)します +クリーンなconda環境(Python 3.8 - 3.12)でlmdeployをインストールすることをお勧めします。 ```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy pip install lmdeploy ``` -v0.3.0以降、デフォルトのプリビルドパッケージは**CUDA 12**でコンパイルされています。ただし、CUDA 11+が必要な場合は、次のコマンドでlmdeployをインストールできます: - -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +v0.3.0から、デフォルトの事前構築済みパッケージはCUDA 12でコンパイルされています。 +CUDA 11+プラットフォームでのインストールに関する情報、またはソースからのビルド手順については、[インストールガイドを](docs/en/installation.md)参照してください。 ## オフラインバッチ推論 @@ -196,7 +193,7 @@ print(response) > > `export LMDEPLOY_USE_MODELSCOPE=True` -推論パイプラインに関する詳細情報は[こちら](./docs/en/inference/pipeline.md)を参照してください。 +推論パイプラインに関する詳細情報は[こちら](./docs/en/llm/pipeline.md)を参照してください。 # チュートリアル @@ -205,10 +202,10 @@ LMDeployの基本的な使用方法については、[getting_started](./docs/en 詳細なユーザーガイドと高度なガイドについては、[チュートリアル](https://lmdeploy.readthedocs.io/en/latest/)を参照してください: - ユーザーガイド - - [LLM推論パイプライン](./docs/en/inference/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - - [VLM推論パイプライン](./docs/en/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - - [LLMサービング](docs/en/serving/api_server.md) - - [VLMサービング](docs/en/serving/api_server_vl.md) + - [LLM推論パイプライン](./docs/en/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) + - [VLM推論パイプライン](./docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) + - 
[LLMサービング](docs/en/llm/api_server.md) + - [VLMサービング](docs/en/multi_modal/api_server_vl.md) - [量子化](docs/en/quantization) - 高度なガイド - [推論エンジン - TurboMind](docs/en/inference/turbomind.md) @@ -217,7 +214,7 @@ LMDeployの基本的な使用方法については、[getting_started](./docs/en - [新しいモデルの追加](docs/en/advance/pytorch_new_model.md) - gemmチューニング - [長文推論](docs/en/advance/long_context.md) - - [マルチモデル推論サービス](docs/en/serving/proxy_server.md) + - [マルチモデル推論サービス](docs/en/llm/proxy_server.md) # サードパーティプロジェクト diff --git a/README_zh-CN.md b/README_zh-CN.md index 5ea1c66ad7..b7d5634fa5 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeploy现已集成至 [modelscope/swift](https://github.com/modelscope/swift),成为 VLMs 推理的默认加速引擎 - \[2024/07\] 🎉🎉 支持 Llama3.1 8B 和 70B 模型,以及工具调用功能 -- \[2024/07\] 支持 [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) 全系列模型,[InternLM-XComposer2.5](docs/zh_cn/multi_modal/xcomposer2d5.md) 模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/serving/api_server_tools.md) +- \[2024/07\] 支持 [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) 全系列模型,[InternLM-XComposer2.5](docs/zh_cn/multi_modal/xcomposer2d5.md) 模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/llm/api_server_tools.md) - \[2024/06\] PyTorch engine 支持了 DeepSeek-V2 和若干 VLM 模型推理, 比如 CogVLM2,Mini-InternVL,LlaVA-Next - \[2024/05\] 在多 GPU 上部署 VLM 模型时,支持把视觉部分的模型均分到多卡上 - \[2024/05\] 支持InternVL v1.5, LLaVa, InternLMXComposer2 等 VLMs 模型的 4bit 权重量化和推理 @@ -39,8 +39,8 @@ ______________________________________________________________________ - \[2024/03\] 支持 DeepSeek-VL 的离线推理 pipeline 和推理服务 - \[2024/03\] 支持视觉-语言模型(VLM)的离线推理 pipeline 和推理服务 - \[2024/02\] 支持 Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOE 等模型 -- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](./docs/zh_cn/serving/api_server.md) -- \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](./docs/zh_cn/serving/proxy_server.md) +- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](docs/zh_cn/llm/api_server.md) +- \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](docs/zh_cn/llm/proxy_server.md) - \[2024/01\] 增加 [PyTorch 推理引擎](./docs/zh_cn/inference/pytorch.md),作为 TurboMind 引擎的补充。帮助降低开发门槛,和快速实验新特性、新技术 @@ -168,19 +168,15 @@ LMDeploy 支持 2 种推理引擎: [TurboMind](./docs/zh_cn/inference/turbomin ## 安装 -使用 pip ( python 3.8+) 安装 LMDeploy,或者[源码安装](./docs/zh_cn/build.md) +我们推荐在一个干净的conda环境下(python3.8 - 3.12),安装 lmdeploy: ```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy pip install lmdeploy ``` -自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令: - -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,或者源码安装 LMDeploy,请参考[安装文档](./docs/zh_cn/installation.md) ## 离线批处理 @@ -196,7 +192,7 @@ print(response) > > `export LMDEPLOY_USE_MODELSCOPE=True` -关于 pipeline 的更多推理参数说明,请参考[这里](./docs/zh_cn/inference/pipeline.md) +关于 pipeline 的更多推理参数说明,请参考[这里](docs/zh_cn/llm/pipeline.md) # 用户教程 @@ -205,10 +201,10 @@ print(response) 为了帮助用户更进一步了解 
LMDeploy,我们准备了用户指南和进阶指南,请阅读我们的[文档](https://lmdeploy.readthedocs.io/zh-cn/latest/): - 用户指南 - - [LLM 推理 pipeline](./docs/zh_cn/inference/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - - [VLM 推理 pipeline](./docs/zh_cn/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - - [LLM 推理服务](./docs/zh_cn/serving/api_server.md) - - [VLM 推理服务](./docs/zh_cn/serving/api_server_vl.md) + - [LLM 推理 pipeline](docs/zh_cn/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) + - [VLM 推理 pipeline](docs/zh_cn/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) + - [LLM 推理服务](docs/zh_cn/llm/api_server.md) + - [VLM 推理服务](docs/zh_cn/multi_modal/api_server_vl.md) - [模型量化](./docs/zh_cn/quantization) - 进阶指南 - [推理引擎 - TurboMind](./docs/zh_cn/inference/turbomind.md) @@ -217,7 +213,7 @@ print(response) - [支持新模型](./docs/zh_cn/advance/pytorch_new_model.md) - gemm tuning - [长文本推理](./docs/zh_cn/advance/long_context.md) - - [多模型推理服务](./docs/zh_cn/serving/proxy_server.md) + - [多模型推理服务](docs/zh_cn/llm/proxy_server.md) # 社区项目 diff --git a/docs/en/advance/debug_turbomind.md b/docs/en/advance/debug_turbomind.md index 5af559a270..c4c7b32f7f 100644 --- a/docs/en/advance/debug_turbomind.md +++ b/docs/en/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind is implemented in C++, which is not as easy to debug as Python. This d ## Prerequisite -First, complete the local compilation according to the commands in [Build in localhost](../build.md). +First, complete the local compilation according to the commands in [Install from source](../installation.md). ## Configure Python debug environment diff --git a/docs/en/benchmark/evaluate_with_opencompass.md b/docs/en/benchmark/evaluate_with_opencompass.md index 9971b9cc61..f078c6e448 100644 --- a/docs/en/benchmark/evaluate_with_opencompass.md +++ b/docs/en/benchmark/evaluate_with_opencompass.md @@ -8,11 +8,7 @@ In this part, we are going to setup the environment for evaluation. ### Install lmdeploy -Install lmdeploy through pip (python 3.8+). If you want to install from source, you can refer to [build.md](../build.md). - -```shell -pip install lmdeploy -``` +Please follow the [installation guide](../installation.md) to install lmdeploy. ### Install OpenCompass diff --git a/docs/en/benchmark/profile_api_server.md b/docs/en/benchmark/profile_api_server.md index 456ee308ae..07dfc49007 100644 --- a/docs/en/benchmark/profile_api_server.md +++ b/docs/en/benchmark/profile_api_server.md @@ -41,7 +41,7 @@ In this section, we take [internlm/internlm-7b](https://huggingface.co/internlm/ lmdeploy serve api_server internlm/internlm-7b ``` -If you would like to change the server's port or other parameters, such as inference engine, max batch size and etc., please run `lmdeploy serve api_server -h` or read [this](../serving/api_server.md) guide to get the detailed explanation. 
+If you would like to change the server's port or other parameters, such as inference engine, max batch size and etc., please run `lmdeploy serve api_server -h` or read [this](../llm/api_server.md) guide to get the detailed explanation. ### Profile diff --git a/docs/en/build.md b/docs/en/build.md deleted file mode 100644 index 51c660b30e..0000000000 --- a/docs/en/build.md +++ /dev/null @@ -1,85 +0,0 @@ -# Build from source - -LMDeploy provides prebuilt package that can be easily installed by `pip install lmdeploy`. - -If you have requests to build lmdeploy from source, please clone lmdeploy repository from GitHub, and follow instructions in next sections - -```shell -git clone --depth=1 https://github.com/InternLM/lmdeploy -``` - -## Build in Docker (recommended) - -We highly advise using the provided docker image for lmdeploy build to circumvent complex environment setup. - -The docker image is `openmmlab/lmdeploy-builder:cuda11.8`. Make sure that docker is installed before using this image. - -In the root directory of the lmdeploy source code, please run the following command: - -```shell -# the home folder of lmdeploy source code -cd lmdeploy -bash builder/manywheel/build_all_wheel.sh -``` - -All the wheel files for lmdeploy under py3.8 - py3.11 will be found in the `builder/manywheel/cuda11.8_dist` directory, such as, - -```text -builder/manywheel/cuda11.8_dist/ -├── lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp311-cp311-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp38-cp38-manylinux2014_x86_64.whl -└── lmdeploy-0.0.12-cp39-cp39-manylinux2014_x86_64.whl -``` - -If the wheel file for a specific Python version is required, such as py3.8, please execute: - -```shell -bash builder/manywheel/build_wheel.sh py38 manylinux2014_x86_64 cuda11.8 cuda11.8_dist -``` - -And the wheel file will be found in the `builder/manywheel/cuda11.8_dist` directory. - -You can use `pip install` to install the wheel file that matches the Python version on your host machine. - -## Build in localhost (optional) - -Firstly, please make sure gcc version is no less than 9, which can be conformed by `gcc --version`. - -Then, follow the steps below to set up the compilation environment: - -- install the dependent packages: - ```shell - pip install -r requirements.txt - apt-get install rapidjson-dev - ``` -- install [nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), and set environment variables: - ```shell - export NCCL_ROOT_DIR=/path/to/nccl - export NCCL_LIBRARIES=/path/to/nccl/lib - ``` -- install openmpi from source: - ```shell - wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz - tar xf openmpi-4.1.5.tar.gz - cd openmpi-4.1.5 - ./configure --prefix=/usr/local/openmpi - make -j$(nproc) && make install - export PATH=$PATH:/usr/local/openmpi/bin - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib - ``` -- build and install lmdeploy libraries: - ```shell - # install ninja - apt install ninja-build - # the home folder of lmdeploy - cd lmdeploy - mkdir build && cd build - sh ../generate.sh - ninja -j$(nproc) && ninja install - ``` -- install lmdeploy python package: - ```shell - cd .. - pip install -e . 
- ``` diff --git a/docs/en/conf.py b/docs/en/conf.py index 18a1b7d1d2..c24e6ab6f0 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -106,16 +106,16 @@ 'path_to_docs': 'docs/en', 'repository_url': 'https://github.com/InternLM/lmdeploy', 'repository_branch': 'main', - 'show_navbar_depth': 3, - 'max_navbar_depth': 4, - 'collapse_navbar': True, + # 'show_navbar_depth': 3, + # 'navigation_depth': 4, + # 'collapse_navigation': False, 'use_edit_page_button': True, 'use_source_button': True, 'use_issues_button': True, 'use_repository_button': True, 'use_download_button': True, 'use_sidenotes': True, - 'show_toc_level': 2, + # 'show_toc_level': 2, # "icon_links": [ # { # "name": "切换至简体中文", diff --git a/docs/en/get_started.md b/docs/en/get_started.md index c79a295ea1..76045aebab 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -1,69 +1,201 @@ -# Get Started +# Quick Start -LMDeploy offers functionalities such as model quantization, offline batch inference, online serving, etc. Each function can be completed with just a few simple lines of code or commands. +This tutorial shows the usage of LMDeploy on: -## Installation +- Offline inference of LLM model and VLM model +- Serve a LLM or VLM model by the OpenAI compatible server +- Console CLI to interactively chat with LLM model -Install lmdeploy with pip (python 3.8+) or [from source](./build.md) +Before reading further, please ensure that you have installed lmdeploy as outlined in the [installation guide](installation.md) -```shell -pip install lmdeploy +## Offline batch inference + +### LLM inference + +```python +from lmdeploy import pipeline +pipe = pipeline('internlm/internlm2_5-7b-chat') +response = pipe(['Hi, pls intro yourself', 'Shanghai is']) +print(response) ``` -The default prebuilt package is compiled on **CUDA 12**. However, if CUDA 11+ is required, you can install lmdeploy by: +When constructing the `pipeline`, if an inference engine is not designated between the TurboMind Engine and the PyTorch Engine, LMDeploy will automatically assign one based on [their respective capabilities](supported_models/supported_models.md), with the TurboMind Engine taking precedence by default. -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +However, you have the option to manually select an engine. For instance, + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=TurbomindEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) ``` -## Offline batch inference +or, + +```python +from lmdeploy import pipeline, PytorchEngineConfig +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=PytorchEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) +``` + +```{note} +The parameter "cache_max_entry_count" significantly influences the GPU memory usage. It means the proportion of FREE GPU memory occupied by the K/V cache after the model weights are loaded. +The default value is 0.8. 
Once allocated, the K/V cache memory is reused repeatedly, which is why it is common to observe that the built pipeline and the api_server mentioned later in the next consumes a substantial amount of GPU memory. +If you encounter an Out-of-Memory(OOM) error, you may need to consider lowering the value of "cache_max_entry_count". +``` + +When use the callable `pipe()` to perform token generation with given prompts, you can set the sampling parameters via `GenerationConfig` as below: + +```python +from lmdeploy import GenerationConfig, pipeline + +pipe = pipeline('internlm/internlm2_5-7b-chat') +prompts = ['Hi, pls intro yourself', 'Shanghai is'] +response = pipe(prompts, + gen_config=GenerationConfig( + max_new_tokens=1024, + top_p=0.8, + top_k=40, + temperature=0.6 + )) +``` + +In the `GenerationConfig`, `top_k=1` or `temperature=0.0` indicates greedy search. + +For more information about pipeline, please read the [detailed tutorial](llm/pipeline.md) + +### VLM inference + +The usage of VLM inference pipeline is akin to that of LLMs, with the additional capability of processing image data with the pipeline. +For example, you can utilize the following code snippet to perform the inference with an InternVL model: ```python -import lmdeploy -pipe = lmdeploy.pipeline("internlm/internlm2_5-7b-chat") -response = pipe(["Hi, pls intro yourself", "Shanghai is"]) +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +In VLM pipeline, the default image processing batch size is 1. This can be adjusted by `VisionConfig`. For instance, you might set it like this: + +```python +from lmdeploy import pipeline, VisionConfig +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B', + vision_config=VisionConfig( + max_batch_size=8 + )) + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) print(response) ``` -For more information on inference pipeline parameters, please refer to [here](./inference/pipeline.md). +However, the larger the image batch size, the greater risk of an OOM error, because the LLM component within the VLM model pre-allocates a massive amount of memory in advance. + +We encourage you to manually choose between the TurboMind Engine and the PyTorch Engine based on their respective capabilities, as detailed in [the supported-models matrix](./supported_models/supported_models.md). +Additionally, follow the instructions in [LLM Inference](#llm-inference) section to reduce the values of memory-related parameters ## Serving -LMDeploy offers various serving methods, choosing one that best meet your requirements. +As demonstrated in the previous [offline batch inference](#offline-batch-inference) section, this part presents the respective serving methods for LLMs and VLMs. 
-- [Serving with openai compatible server](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html) -- [Serving with docker](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html#option-2-deploying-with-docker) -- [Serving with gradio](https://lmdeploy.readthedocs.io/en/latest/serving/gradio.html) +### Serve a LLM model -## Quantization +```shell +lmdeploy serve api_server internlm/internlm2_5-7b-chat +``` -LMDeploy provides the following quantization methods. Please visit the following links for the detailed guide +This command will launch an OpenAI-compatible server on the localhost at port `23333`. You can specify a different server port by using the `--server-port` option. +For more options, consult the help documentation by running `lmdeploy serve api_server --help`. Most of these options align with the engine configuration. -- [4bit weight-only quantization](quantization/w4a16.md) -- [k/v quantization](quantization/kv_quant.md) -- [w8a8 quantization](quantization/w8a8.md) +To access the service, you can utilize the official OpenAI Python package `pip install openai`. Below is an example demonstrating how to use the entrypoint `v1/chat/completions` -## Useful Tools +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` -LMDeploy CLI offers the following utilities, helping users experience LLM features conveniently +We encourage you to refer to the detailed guide for more comprehensive information about [serving with Docker](./llm/api_server.md), [function calls](llm/api_server_tools.md) and other topics -### Inference with Command line Interface +### Serve a VLM model ```shell -lmdeploy chat internlm/internlm2_5-7b-chat +lmdeploy serve api_server OpenGVLab/InternVL2-8B ``` -### Serving with Web UI +```{note} +LMDeploy reuses the vision component from upstream VLM repositories. Each upstream VLM model may have different dependencies. +Consequently, LMDeploy has decided not to include the dependencies of the upstream VLM repositories in its own dependency list. +If you encounter an "ImportError" when using LMDeploy for inference with VLM models, please install the relevant dependencies yourself. +``` -LMDeploy adopts gradio to develop the online demo. +After the service is launched successfully, you can access the VLM service in a manner similar to how you would access the `gptv4` service by modifying the `api_key` and `base_url` parameters: + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` + +## Inference with Command line Interface + +LMDeploy offers a very convenient CLI tool for users to chat with the LLM model locally. 
For example: ```shell -# install dependencies -pip install lmdeploy[serve] -# launch gradio server -lmdeploy serve gradio internlm/internlm2_5-7b-chat +lmdeploy chat internlm/internlm2_5-7b-chat --backend turbomind ``` -![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) +It is designed to assist users in checking and verifying whether LMDeploy supports their model, whether the chat template is applied correctly, and whether the inference results are delivered smoothly. + +Another tool, `lmdeploy check_env`, aims to gather the essential environment information. It is crucial when reporting an issue to us, as it helps us diagnose and resolve the problem more effectively. + +If you have any doubt about their usage, you can try using the `--help` option to obtain detailed information. diff --git a/docs/en/index.rst b/docs/en/index.rst index d79b4b60aa..3842b54f08 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -41,50 +41,35 @@ Documentation :maxdepth: 2 :caption: Get Started + installation.md get_started.md -.. _build: -.. toctree:: - :maxdepth: 1 - :caption: Build - - build.md - -.. _benchmark: -.. toctree:: - :maxdepth: 1 - :caption: Benchmark - - benchmark/profile_generation.md - benchmark/profile_throughput.md - benchmark/profile_api_server.md - benchmark/evaluate_with_opencompass.md - .. _supported_models: .. toctree:: :maxdepth: 1 - :caption: Supported Models + :caption: Models supported_models/supported_models.md -.. _inference: +.. _llm_deployment: .. toctree:: :maxdepth: 1 - :caption: Inference + :caption: Large Language Models(LLMs) Deployment - inference/pipeline.md - inference/vl_pipeline.md + llm/pipeline.md + llm/api_server.md + llm/api_server_tools.md + llm/gradio.md + llm/proxy_server.md -.. _serving: +.. _vlm_deployment: .. toctree:: :maxdepth: 1 - :caption: Serving + :caption: Vision-Language Models(VLMs) Deployment - serving/api_server.md - serving/api_server_vl.md - serving/api_server_tools.md - serving/gradio.md - serving/proxy_server.md + multi_modal/vl_pipeline.md + multi_modal/api_server_vl.md + multi_modal/index.rst .. _quantization: .. toctree:: @@ -92,8 +77,18 @@ Documentation :caption: Quantization quantization/w4a16.md - quantization/kv_quant.md quantization/w8a8.md + quantization/kv_quant.md + +.. _benchmark: +.. toctree:: + :maxdepth: 1 + :caption: Benchmark + + benchmark/profile_generation.md + benchmark/profile_throughput.md + benchmark/profile_api_server.md + benchmark/evaluate_with_opencompass.md .. toctree:: :maxdepth: 1 @@ -105,7 +100,6 @@ Documentation advance/long_context.md advance/chat_template.md advance/debug_turbomind.md - serving/qos.md .. toctree:: :maxdepth: 1 diff --git a/docs/en/installation.md b/docs/en/installation.md new file mode 100644 index 0000000000..d1333f45ab --- /dev/null +++ b/docs/en/installation.md @@ -0,0 +1,80 @@ +# Installation + +LMDeploy is a python library for compressing, deploying, and serving Large Language Models(LLMs) and Vision-Language Models(VLMs). +Its core inference engines include TurboMind Engine and PyTorch Engine. The former is developed by C++ and CUDA, striving for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to decrease the barriers for developers. + +It supports LLMs and VLMs deployment on both Linux and Windows platform, with minimum requirement of CUDA version 11.3. 
Furthermore, it is compatible with the following NVIDIA GPUs:
+
+- Volta(sm70): V100
+- Turing(sm75): 20 series, T4
+- Ampere(sm80,sm86): 30 series, A10, A16, A30, A100
+- Ada Lovelace(sm89): 40 series
+
+## Install with pip (Recommend)
+
+It is recommended installing lmdeploy using pip in a conda environment (python 3.8 - 3.12):
+
+```shell
+conda create -n lmdeploy python=3.8 -y
+conda activate lmdeploy
+pip install lmdeploy
+```
+
+The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by:
+
+```shell
+export LMDEPLOY_VERSION=0.5.3
+export PYTHON_VERSION=38
+pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
+```
+
+## Install nightly-build package with pip
+
+The release frequency of LMDeploy is approximately once or twice monthly. If your desired feature has been merged to LMDeploy main branch but hasn't been published yet, you can experiment with the nightly-built package available [here](https://github.com/zhyncs/lmdeploy-build) according to your CUDA and Python versions
+
+## Install from source
+
+If you are using the PyTorch Engine for inference, the installation from the source is quite simple:
+
+```shell
+git clone https://github.com/InternLM/lmdeploy.git
+cd lmdeploy
+pip install -e .
+```
+
+But if you are using the TurboMind Engine, you have to build the source as shown below. The `openmmlab/lmdeploy:{tag}` docker image is strongly recommended.
+
+**Step 1** - Get the docker image of LMDeploy
+
+```shell
+docker pull openmmlab/lmdeploy:latest
+```
+
+```{note}
+The "openmmlab/lmdeploy:latest" is based on "nvidia/cuda:12.4.1-devel-ubuntu22.04". If you are working on a platform with cuda 11+ driver, please use "openmmlab/lmdeploy:latest-cu11".
+The pattern of the LMDeploy docker image tag is "openmmlab/lmdeploy:{version}-cu(11|12)" since v0.5.3.
+```
+
+**Step 2** - Clone LMDeploy source code and change to its root directory
+
+```shell
+git clone https://github.com/InternLM/lmdeploy.git
+cd lmdeploy
+```
+
+**Step 3** - launch docker container in interactive mode
+
+```shell
+docker run --gpus all --net host --shm-size 16g -v $(pwd):/opt/lmdeploy --name lmdeploy -it openmmlab/lmdeploy:latest bin/bash
+```
+
+**Step 4** - build and installation
+
+```shell
+cd /opt/lmdeploy
+mkdir -p build && cd build
+bash ../generate.sh make
+make -j$(nproc) && make install
+cd ..
+pip install -e .
+```
diff --git a/docs/en/serving/api_server.md b/docs/en/llm/api_server.md
similarity index 93%
rename from docs/en/serving/api_server.md
rename to docs/en/llm/api_server.md
index fbb3891f94..285b0e32ff 100644
--- a/docs/en/serving/api_server.md
+++ b/docs/en/llm/api_server.md
@@ -1,9 +1,9 @@
-# Serving LLM with OpenAI Compatible Server
+# OpenAI Compatible Server
 
 This article primarily discusses the deployment of a single LLM model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API.
-For the sake of convenience, we refer to this service as `api_server`.
Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](proxy_server.md). -In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario. +In the following sections, we will first introduce methods for starting the service, choosing the appropriate one based on your application scenario. Next, we focus on the definition of the service's RESTful API, explore the various ways to interact with the interface, and demonstrate how to try the service through the Swagger UI or LMDeploy CLI tools. @@ -242,10 +242,6 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ ## Integrate with WebUI -LMDeploy utilizes `gradio` or [OpenAOE](https://github.com/InternLM/OpenAOE) to integrate a web ui for `api_server` - -### Option 1: gradio - ```shell # api_server_url is what printed in api_server.py, e.g. http://localhost:23333 # server_ip and server_port here are for gradio ui @@ -253,21 +249,12 @@ LMDeploy utilizes `gradio` or [OpenAOE](https://github.com/InternLM/OpenAOE) to lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -### Option 2: OpenAOE - -```shell -pip install -U openaoe -openaoe -f /path/to/your/config-template.yaml -``` - -Please refer to the [guidance](https://github.com/InternLM/OpenAOE/blob/main/docs/tech-report/model_serving_by_lmdeploy/model_serving_by_lmdeploy.md) for more deploy information. - ## FAQ 1. When user got `"finish_reason":"length"`, it means the session is too long to be continued. The session length can be modified by passing `--session_len` to api_server. -2. When OOM appeared at the server side, please reduce the `cache_max_entry_count` of `backend_config` when lanching the service. +2. When OOM appeared at the server side, please reduce the `cache_max_entry_count` of `backend_config` when launching the service. 3. When the request with the same `session_id` to `/v1/chat/interactive` got a empty return value and a negative `tokens`, please consider setting `interactive_mode=false` to restart the session. diff --git a/docs/en/serving/api_server_tools.md b/docs/en/llm/api_server_tools.md similarity index 99% rename from docs/en/serving/api_server_tools.md rename to docs/en/llm/api_server_tools.md index 379d4942b1..0a6b8f7768 100644 --- a/docs/en/serving/api_server_tools.md +++ b/docs/en/llm/api_server_tools.md @@ -1,4 +1,4 @@ -# Tools +# Tools Calling LMDeploy supports tools for InternLM2, InternLM2.5 and llama3.1 models. @@ -155,7 +155,7 @@ ChatCompletion(id='2', choices=[Choice(finish_reason='tool_calls', index=0, logp Meta announces in [Llama3's official user guide](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1) that, -```{text} +```{note} There are three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt: 1. Brave Search: Tool call to perform web searches. diff --git a/docs/en/serving/gradio.md b/docs/en/llm/gradio.md similarity index 98% rename from docs/en/serving/gradio.md rename to docs/en/llm/gradio.md index f4ddee3124..8707d20a82 100644 --- a/docs/en/serving/gradio.md +++ b/docs/en/llm/gradio.md @@ -1,4 +1,4 @@ -# Serving with Gradio +# WebUI Demo Starting an LLM model's gradio service with LMDeploy and interacting with the model on the WebUI is incredibly simple. 
diff --git a/docs/en/inference/pipeline.md b/docs/en/llm/pipeline.md
similarity index 99%
rename from docs/en/inference/pipeline.md
rename to docs/en/llm/pipeline.md
index 695a1f4f26..5570673576 100644
--- a/docs/en/inference/pipeline.md
+++ b/docs/en/llm/pipeline.md
@@ -1,4 +1,4 @@
-# LLM Offline Inference Pipeline
+# Offline Inference Pipeline
 
 In this tutorial, We will present a list of examples to introduce the usage of `lmdeploy.pipeline`.
 
diff --git a/docs/en/serving/proxy_server.md b/docs/en/llm/proxy_server.md
similarity index 100%
rename from docs/en/serving/proxy_server.md
rename to docs/en/llm/proxy_server.md
diff --git a/docs/en/serving/qos.md b/docs/en/llm/qos.md
similarity index 100%
rename from docs/en/serving/qos.md
rename to docs/en/llm/qos.md
diff --git a/docs/en/serving/api_server_vl.md b/docs/en/multi_modal/api_server_vl.md
similarity index 97%
rename from docs/en/serving/api_server_vl.md
rename to docs/en/multi_modal/api_server_vl.md
index 390da44c5e..878ddfea9c 100644
--- a/docs/en/serving/api_server_vl.md
+++ b/docs/en/multi_modal/api_server_vl.md
@@ -1,7 +1,7 @@
-# Serving VLM with OpenAI Compatible Server
+# OpenAI Compatible Server
 
 This article primarily discusses the deployment of a single large vision language model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API.
-For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](./proxy_server.md).
+For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](../llm/proxy_server.md).
 
 In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario.
 
diff --git a/docs/en/multi_modal/cogvlm.md b/docs/en/multi_modal/cogvlm.md
index 256683bcc3..fcffdeb3c3 100644
--- a/docs/en/multi_modal/cogvlm.md
+++ b/docs/en/multi_modal/cogvlm.md
@@ -1,4 +1,4 @@
-# cogvlm
+# CogVLM
 
 ## Introduction
 
diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst
new file mode 100644
index 0000000000..3c6061e776
--- /dev/null
+++ b/docs/en/multi_modal/index.rst
@@ -0,0 +1,12 @@
+Vision-Language Models
+=================================
+
+..
toctree:: + :maxdepth: 2 + :caption: Examples + + llava.md + internvl.md + xcomposer2d5.md + cogvlm.md + minicpmv.md diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md new file mode 100644 index 0000000000..0b204cb920 --- /dev/null +++ b/docs/en/multi_modal/internvl.md @@ -0,0 +1,3 @@ +# InternVL + +TODO diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md new file mode 100644 index 0000000000..cf95e15d5c --- /dev/null +++ b/docs/en/multi_modal/llava.md @@ -0,0 +1,3 @@ +# LLaVA + +TODO diff --git a/docs/en/inference/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md similarity index 98% rename from docs/en/inference/vl_pipeline.md rename to docs/en/multi_modal/vl_pipeline.md index 047fb37afc..72eb0b4595 100644 --- a/docs/en/inference/vl_pipeline.md +++ b/docs/en/multi_modal/vl_pipeline.md @@ -1,6 +1,6 @@ -# VLM Offline Inference Pipeline +# Offline Inference Pipeline -LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](./pipeline.md). +LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](../llm/pipeline.md). Currently, it supports the following models. diff --git a/docs/en/quantization/kv_quant.md b/docs/en/quantization/kv_quant.md index 17b741699c..22a8ec91ae 100644 --- a/docs/en/quantization/kv_quant.md +++ b/docs/en/quantization/kv_quant.md @@ -1,10 +1,10 @@ -# Key-Value(KV) Cache Quantization +# INT4/INT8 KV Cache Since v0.4.0, LMDeploy has supported **online** key-value (kv) cache quantization with int4 and int8 numerical precision, utilizing an asymmetric quantization method that is applied on a per-head, per-token basis. The original kv offline quantization method has been removed. Intuitively, quantization is beneficial for increasing the number of kv block. Compared to fp16, the number of kv block for int4/int8 kv can be increased by 4 times and 2 times respectively. This means that under the same memory conditions, the system can support a significantly increased number of concurrent operations after kv quantization, thereby ultimately enhancing throughput. -However, quantization typically brings in some loss of model accuracy. We have used OpenCompass to evaluate the accuracy of several models after applying int4/int8 quantization. int8 kv keeps the accuracy while int4 kv has slight loss. The detailed results are presented in the [Evaluation](#Evaluation) section. You can refer to the information and choose wisely based on your requirements. +However, quantization typically brings in some loss of model accuracy. We have used OpenCompass to evaluate the accuracy of several models after applying int4/int8 quantization. int8 kv keeps the accuracy while int4 kv has slight loss. The detailed results are presented in the [Evaluation](#evaluation) section. You can refer to the information and choose wisely based on your requirements. LMDeploy inference with quantized kv supports the following NVIDIA GPU models: diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 46b5b26a24..7b0306c9ec 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -1,4 +1,4 @@ -# W4A16 Quantization +# AWQ LMDeploy adopts [AWQ](https://arxiv.org/abs/2306.00978) algorithm for 4bit weight-only quantization. 
By developed the high-performance cuda kernel, the 4bit quantized model inference achieves up to 2.4x faster than FP16.
 
@@ -88,7 +88,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"])
 print(response)
 ```
 
-For more information about the pipeline parameters, please refer to [here](../inference/pipeline.md).
+For more information about the pipeline parameters, please refer to [here](../llm/pipeline.md).
 
 In addition to performing inference with the quantized model on localhost, LMDeploy can also execute inference for the 4bit quantized model derived from AWQ algorithm available on Huggingface Hub, such as models from the [lmdeploy space](https://huggingface.co/lmdeploy) and [TheBloke space](https://huggingface.co/TheBloke)
 
@@ -124,7 +124,7 @@ The default port of `api_server` is `23333`. After the server is launched, you c
 lmdeploy serve api_client http://0.0.0.0:23333
 ```
 
-You can overview and try out `api_server` APIs online by swagger UI at `http://0.0.0.0:23333`, or you can also read the API specification from [here](../serving/api_server.md).
+You can overview and try out `api_server` APIs online by swagger UI at `http://0.0.0.0:23333`, or you can also read the API specification from [here](../llm/api_server.md).
 
 ## Performance
 
diff --git a/docs/en/quantization/w8a8.md b/docs/en/quantization/w8a8.md
index 12e873b4f1..1b1726bd5f 100644
--- a/docs/en/quantization/w8a8.md
+++ b/docs/en/quantization/w8a8.md
@@ -1,4 +1,4 @@
-# W8A8 LLM Model Deployment
+# SmoothQuant
 
 LMDeploy provides functions for quantization and inference of large language models using 8-bit integers.
 
diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md
index 9bbd7ba3a5..5ef5bfa691 100644
--- a/docs/en/supported_models/codellama.md
+++ b/docs/en/supported_models/codellama.md
@@ -108,4 +108,4 @@ or through webui after launching gradio,
 lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port}
 ```
 
-Regarding the detailed information of RESTful API, you can refer to the [guide](../serving/api_server.md).
+Regarding the detailed information of RESTful API, you can refer to the [guide](../llm/api_server.md).
diff --git a/docs/zh_cn/advance/debug_turbomind.md b/docs/zh_cn/advance/debug_turbomind.md index 190600c1f1..cb95c6ef4d 100644 --- a/docs/zh_cn/advance/debug_turbomind.md +++ b/docs/zh_cn/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind 使用 C++ 实现,不像 Python 一样易于调试。该文档提供 ## 前置工作 -首先,根据构建[命令](../build.md)完成本地编译。 +首先,根据构建[命令](../installation.md)完成源码编译和安装。 ## 配置 Python 调试环境 diff --git a/docs/zh_cn/benchmark/evaluate_with_opencompass.md b/docs/zh_cn/benchmark/evaluate_with_opencompass.md index d12a82f113..d45c8b28a0 100644 --- a/docs/zh_cn/benchmark/evaluate_with_opencompass.md +++ b/docs/zh_cn/benchmark/evaluate_with_opencompass.md @@ -8,11 +8,7 @@ LMDeploy设计了TurboMind推理引擎用来加速大模型推理,其推理精 ### 安装 lmdeploy -使用 pip (python 3.8+) 安装 LMDeploy,或者[源码安装](../build.md) - -```shell -pip install lmdeploy -``` +请参考[安装指南](../installation.md)安装 lmdeploy ### 安装 OpenCompass diff --git a/docs/zh_cn/benchmark/profile_api_server.md b/docs/zh_cn/benchmark/profile_api_server.md index 01c6fa35c4..c872820040 100644 --- a/docs/zh_cn/benchmark/profile_api_server.md +++ b/docs/zh_cn/benchmark/profile_api_server.md @@ -41,7 +41,7 @@ $$ lmdeploy serve api_server internlm/internlm-7b ``` -如果你想改变 server 的端口,或者诸如推理引擎、最大批处理值等参数,请运行 `lmdeploy serve api_server -h` 或者阅读[这篇文档](../serving/api_server.md),查看详细的参数说明。 +如果你想改变 server 的端口,或者诸如推理引擎、最大批处理值等参数,请运行 `lmdeploy serve api_server -h` 或者阅读[这篇文档](../llm/api_server.md),查看详细的参数说明。 ### 测速 diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md deleted file mode 100644 index 48145ec0af..0000000000 --- a/docs/zh_cn/build.md +++ /dev/null @@ -1,86 +0,0 @@ -# 编译和安装 - -LMDeploy 提供了预编译包,可以很方便的通过 `pip install lmdeploy` 安装和使用。 - -如果有源码编译的需求,请先下载 lmdeploy 源码: - -```shell -git clone --depth=1 https://github.com/InternLM/lmdeploy -``` - -然后,参考以下章节编译和安装。 - -## 在 docker 内编译安装(强烈推荐) - -LMDeploy 提供了编译镜像 `openmmlab/lmdeploy-builder:cuda11.8`。使用之前,请确保 docker 已安装。 - -在 lmdeploy 源码的根目录下,运行以下命令: - -```shell -# lmdeploy 源码根目录 -cd lmdeploy -bash builder/manywheel/build_all_wheel.sh -``` - -即可在 `builder/manywheel/cuda11.8_dist` 文件夹下,得到 lmdeploy 在 py3.8 - py3.11 下所有的 wheel 文件。比如, - -```text -builder/manywheel/cuda11.8_dist/ -├── lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp311-cp311-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp38-cp38-manylinux2014_x86_64.whl -└── lmdeploy-0.0.12-cp39-cp39-manylinux2014_x86_64.whl -``` - -如果需要固定 python 版本的 wheel 文件,比如 py3.8,可以执行: - -```shell -bash builder/manywheel/build_wheel.sh py38 manylinux2014_x86_64 cuda11.8 cuda11.8_dist -``` - -wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。 - -在宿主机上,通过 `pip install` 安装和宿主机python版本一致的 wheel 文件,即完成 lmdeploy 整个编译安装过程。 - -## 在物理机上编译安装(可选) - -首先,请确保物理机环境的 gcc 版本不低于 9,可以通过`gcc --version`确认。 - -然后,按如下步骤,配置编译环境: - -- 安装编译和运行依赖包: - ```shell - pip install -r requirements.txt - apt-get install rapidjson-dev - ``` -- 安装 [nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html),设置环境变量 - ```shell - export NCCL_ROOT_DIR=/path/to/nccl - export NCCL_LIBRARIES=/path/to/nccl/lib - ``` -- 源码编译安装 openmpi: - ```shell - wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz - tar xf openmpi-4.1.5.tar.gz - cd openmpi-4.1.5 - ./configure --prefix=/usr/local/openmpi - make -j$(nproc) && make install - export PATH=$PATH:/usr/local/openmpi/bin - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib - ``` -- lmdeploy 编译安装: - ```shell - # 安装更快的 Ninja - apt install ninja-build - # lmdeploy 源码的根目录 - cd lmdeploy - mkdir build && cd build - sh ../generate.sh - ninja && ninja 
install - ninja -j$(nproc) && ninja install - ``` -- 安装 lmdeploy python package: - ```shell - cd .. - pip install -e . - ``` diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 21e96f2dc4..6804c626c3 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -107,16 +107,16 @@ 'path_to_docs': 'docs/zh_cn', 'repository_url': 'https://github.com/InternLM/lmdeploy', 'repository_branch': 'main', - 'show_navbar_depth': 3, - 'max_navbar_depth': 4, - 'collapse_navbar': True, + # 'show_navbar_depth': 3, + # 'navigation_depth': 4, + # 'collapse_navigation': True, 'use_edit_page_button': True, 'use_source_button': True, 'use_issues_button': True, 'use_repository_button': True, 'use_download_button': True, 'use_sidenotes': True, - 'show_toc_level': 2, + # 'show_toc_level': 2, # "icon_links": [ # { # "name": "Switch to English", diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index 32e06b6c41..5649397a8f 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -1,25 +1,19 @@ -# 快速上手 +# 快速开始 LMDeploy提供了快速安装、模型量化、离线批处理、在线推理服务等功能。每个功能只需简单的几行代码或者命令就可以完成。 -## 安装 +本教程将展示 LMDeploy 在以下几方面的使用方法: -使用 pip (python 3.8+) 安装 LMDeploy,或者[源码安装](./build.md) +- LLM 模型和 VLM 模型的离线推理 +- 搭建与 OpenAI 接口兼容的 LLM 或 VLM 模型服务 +- 通过控制台命令行与 LLM 模型进行交互式聊天 -```shell -pip install lmdeploy -``` - -LMDeploy的预编译包默认是基于 CUDA 12 编译的。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令: - -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +在继续阅读之前,请确保你已经按照[安装指南](installation.md)安装了 lmdeploy。 ## 离线批处理 +### LLM 推理 + ```python import lmdeploy pipe = lmdeploy.pipeline("internlm/internlm2_5-7b-chat") @@ -27,41 +21,181 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -有关 pipeline 的详细使用说明,请参考[这里](./inference/pipeline.md) +在构造 `pipeline` 时,如果没有指定使用 TurboMind 引擎或 PyTorch 引擎进行推理,LMDeploy 将根据[它们各自的能力](supported_models/supported_models.md)自动分配一个,默认优先使用 TurboMind 引擎。 + +然而,你可以选择手动选择一个引擎。例如, + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=TurbomindEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) +``` + +或者, + +```python +from lmdeploy import pipeline, PytorchEngineConfig +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=PytorchEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) +``` + +```{note} +参数 "cache_max_entry_count" 显著影响 GPU 内存占用。它表示加载模型权重后 K/V 缓存占用的空闲 GPU 内存的比例。 +默认值是 0.8。K/V 缓存分配方式是一次性申请,重复性使用,这就是为什么 pipeline 以及下文中的 api_server 在启动后会消耗大量 GPU 内存。 +如果你遇到内存不足(OOM)错误的错误,可能需要考虑降低 cache_max_entry_count 的值。 +``` + +当使用 `pipe()` 生成提示词的 token 时,你可以通过 `GenerationConfig` 设置采样参数,如下所示: + +```python +from lmdeploy import GenerationConfig, pipeline + +pipe = pipeline('internlm/internlm2_5-7b-chat') +prompts = ['Hi, pls intro yourself', 'Shanghai is'] +response = pipe(prompts, + gen_config=GenerationConfig( + max_new_tokens=1024, + top_p=0.8, + top_k=40, + temperature=0.6 + )) +``` + +在 `GenerationConfig` 中,`top_k=1` 或 `temperature=0.0` 表示贪心搜索。 -## 推理服务 +有关 pipeline 的更多信息,请参考[这里](llm/pipeline.md) -LMDeploy 提供了多种部署模型推理服务的方式,总有一款适合你。 +### VLM 推理 -- [部署类 openai 
的服务](https://lmdeploy.readthedocs.io/zh-cn/latest//serving/api_server.html) -- [通过 docker 部署服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/api_server.html#docker) -- [部署 gradio 服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/gradio.html) +VLM 推理 pipeline 与 LLM 类似,但增加了使用 pipeline 处理图像数据的能力。例如,你可以使用以下代码片段对 InternVL 模型进行推理: -## 模型量化 +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image -- [INT4 权重量化](quantization/w4a16.md) -- [K/V 量化](quantization/kv_quant.md) -- [W8A8 量化](quantization/w8a8.md) +pipe = pipeline('OpenGVLab/InternVL2-8B') -## 好用的工具 +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` -LMDeploy CLI 提供了如下便捷的工具,方便用户快速体验模型对话效果 +在 VLM pipeline 中,默认的图像处理批量大小是 1。这可以通过 `VisionConfig` 调整。例如,你可以这样设置: -### 控制台交互式对话 +```python +from lmdeploy import pipeline, VisionConfig +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B', + vision_config=VisionConfig( + max_batch_size=8 + )) + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +然而,图像批量大小越大,OOM 错误的风险越大,因为 VLM 模型中的 LLM 部分会提前预分配大量的内存。 + +VLM pipeline 对于推理引擎的选择方式与 LLM pipeline 类似。你可以参考 [LLM 推理](#llm-推理)并结合两个引擎支持的 VLM 模型列表,手动选择和配置推理引擎。 + +## 模型服务 + +类似前文[离线批量推理](#离线批处理),我们在本章节介绍 LLM 和 VLM 各自构建服务方法。 + +### LLM 模型服务 ```shell -lmdeploy chat internlm/internlm2_5-7b-chat +lmdeploy serve api_server internlm/internlm2_5-7b-chat ``` -### WebUI 交互式对话 +此命令将在本地主机上的端口 `23333` 启动一个与 OpenAI 接口兼容的模型推理服务。你可以使用 `--server-port` 选项指定不同的服务器端口。 +更多选项,请通过运行 `lmdeploy serve api_server --help` 查阅帮助文档。这些选项大多与引擎配置一致。 -LMDeploy 使用 gradio 开发了在线对话 demo。 +要访问服务,你可以使用官方的 OpenAI Python 包 `pip install openai`。以下是演示如何使用入口点 v1/chat/completions 的示例: + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` + +我们鼓励你参考详细指南,了解关于[使用 Docker 部署服务](./llm/api_server.md)、[工具调用](llm/api_server_tools.md)和其他更多功能的信息。 + +### VLM 模型服务 ```shell -# 安装依赖 -pip install lmdeploy[serve] -# 启动 -lmdeploy serve gradio internlm/internlm2_5-7b-chat +lmdeploy serve api_server OpenGVLab/InternVL2-8B ``` -![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) +```{note} +LMDeploy 复用了上游 VLM 仓库的视觉组件。而每个上游的 VLM 模型,它们的视觉模型可能互不相同,依赖库也各有区别。 +因此,LMDeploy 决定不在自身的依赖列表中加入上游 VLM 库的依赖。如果你在使用 LMDeploy 推理 VLM 模型时出现 "ImportError" 的问题,请自行安装相关的依赖。 +``` + +服务成功启动后,你可以以类似访问 `gptv4` 服务的方式访问 VLM 服务: + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', # A dummy api_key is required + base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` + +## 使用命令行与 LLM 
模型对话 + +LMDeploy 提供了一个非常方便的 CLI 工具,供用户与 LLM 模型进行本地聊天。例如: + +```shell +lmdeploy chat internlm/internlm2_5-7b-chat --backend turbomind +``` + +它的设计目的是帮助用户检查和验证 LMDeploy 是否支持提供的模型,聊天模板是否被正确应用,以及推理结果是否正确。 + +另外,`lmdeploy check_env` 收集基本的环境信息。在给 LMDeploy 提交问题报告时,这非常重要,因为它有助于我们更有效地诊断和解决问题。 + +如果你对它们的使用方法有任何疑问,你可以尝试使用 `--help` 选项获取详细信息。 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 5a6df1fe4c..8691c423b0 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -41,25 +41,9 @@ LMDeploy 工具箱提供以下核心功能: :maxdepth: 2 :caption: 快速上手 + installation.md get_started.md -.. _编译和安装: -.. toctree:: - :maxdepth: 1 - :caption: 编译和安装 - - build.md - -.. _测试基准: -.. toctree:: - :maxdepth: 1 - :caption: 测试基准 - - benchmark/profile_generation.md - benchmark/profile_throughput.md - benchmark/profile_api_server.md - benchmark/evaluate_with_opencompass.md - .. _支持的模型: .. toctree:: :maxdepth: 1 @@ -67,25 +51,25 @@ LMDeploy 工具箱提供以下核心功能: supported_models/supported_models.md -.. _推理: +.. _llm_部署: .. toctree:: :maxdepth: 1 - :caption: 推理 + :caption: 大语言模型(LLMs)部署 - inference/pipeline.md - inference/vl_pipeline.md + llm/pipeline.md + llm/api_server.md + llm/api_server_tools.md + llm/gradio.md + llm/proxy_server.md - -.. _服务: +.. _vlm_部署: .. toctree:: :maxdepth: 1 - :caption: 服务 + :caption: 视觉-语言模型(VLMs)部署 - serving/api_server.md - serving/api_server_vl.md - serving/api_server_tools.md - serving/gradio.md - serving/proxy_server.md + multi_modal/vl_pipeline.md + multi_modal/api_server_vl.md + multi_modal/index.rst .. _量化: @@ -94,8 +78,18 @@ LMDeploy 工具箱提供以下核心功能: :caption: 量化 quantization/w4a16.md - quantization/kv_quant.md quantization/w8a8.md + quantization/kv_quant.md + +.. _测试基准: +.. toctree:: + :maxdepth: 1 + :caption: 测试基准 + + benchmark/profile_generation.md + benchmark/profile_throughput.md + benchmark/profile_api_server.md + benchmark/evaluate_with_opencompass.md .. toctree:: :maxdepth: 1 @@ -107,7 +101,6 @@ LMDeploy 工具箱提供以下核心功能: advance/long_context.md advance/chat_template.md advance/debug_turbomind.md - serving/qos.md .. toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/installation.md b/docs/zh_cn/installation.md new file mode 100644 index 0000000000..c5758a6428 --- /dev/null +++ b/docs/zh_cn/installation.md @@ -0,0 +1,80 @@ +# 安装 + +LMDeploy 是一个用于大型语言模型(LLMs)和视觉-语言模型(VLMs)压缩、部署和服务的 Python 库。 +其核心推理引擎包括 TurboMind 引擎和 PyTorch 引擎。前者由 C++ 和 CUDA 开发,致力于推理性能的优化,而后者纯 Python 开发,旨在降低开发者的门槛。 + +LMDeploy 支持在 Linux 和 Windows 平台上部署 LLMs 和 VLMs,最低要求 CUDA 版本为 11.3。此外,它还与以下 NVIDIA GPU 兼容: + +Volta(sm70): V100 +Turing(sm75): 20 系列,T4 +Ampere(sm80,sm86): 30 系列,A10, A16, A30, A100 +Ada Lovelace(sm89): 40 系列 + +## 使用 pip 安装(推荐) + +我们推荐在一个干净的conda环境下(python3.8 - 3.12),安装 lmdeploy: + +```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy +pip install lmdeploy +``` + +默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy: + +```shell +export LMDEPLOY_VERSION=0.5.3 +export PYTHON_VERSION=38 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## 使用 pip 安装夜间构建包 + +LMDeploy 的发布频率大约是每月一次或两次。如果你所需的功能已经被合并到 LMDeploy 的主分支但还没有发布,你可以环境中的 CUDA 和 Python 版本,尝试使用[这里](https://github.com/zhyncs/lmdeploy-build)提供的夜间构建包。 + +## 从源码安装 + +如果你使用 PyTorch 引擎进行推理,从源代码安装非常简单: + +```shell +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +pip install -e . 
+``` + +但如果你使用 TurboMind 引擎,请参考以下说明编译源代码。我们强烈推荐使用 `openmmlab/lmdeploy:{tag}` docker 镜像作为编译安装的环境 + +**步骤 1** - 获取 LMDeploy 的 docker 镜像 + +```shell +docker pull openmmlab/lmdeploy:latest +``` + +```{note} +"openmmlab/lmdeploy:latest" 基于 "nvidia/cuda:12.4.1-devel-ubuntu22.04"。如果你在带有 cuda 11+ 驱动的平台上工作,请使用 "openmmlab/lmdeploy:latest-cu11"。 +从 v0.5.3 开始,LMDeploy docker 镜像标签的模式是 "openmmlab/lmdeploy:{version}-cu(11|12)"。 +``` + +**步骤 2** - 克隆 LMDeploy 源代码 + +```shell +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +``` + +**步骤 3** - 以交互模式启动 docker 容器 + +```shell +docker run --gpus all --net host --shm-size 16g -v $(pwd):/opt/lmdeploy --name lmdeploy -it openmmlab/lmdeploy:latest bin/bash +``` + +**步骤 4** - 编译与安装 + +```shell +cd /opt/lmdeploy +mkdir -p build && cd build +bash ../generate.sh make +make -j$(nproc) && make install +cd .. +pip install -e . +``` diff --git a/docs/zh_cn/serving/api_server.md b/docs/zh_cn/llm/api_server.md similarity index 100% rename from docs/zh_cn/serving/api_server.md rename to docs/zh_cn/llm/api_server.md diff --git a/docs/zh_cn/serving/api_server_tools.md b/docs/zh_cn/llm/api_server_tools.md similarity index 100% rename from docs/zh_cn/serving/api_server_tools.md rename to docs/zh_cn/llm/api_server_tools.md diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/llm/gradio.md similarity index 100% rename from docs/zh_cn/serving/gradio.md rename to docs/zh_cn/llm/gradio.md diff --git a/docs/zh_cn/inference/pipeline.md b/docs/zh_cn/llm/pipeline.md similarity index 100% rename from docs/zh_cn/inference/pipeline.md rename to docs/zh_cn/llm/pipeline.md diff --git a/docs/zh_cn/serving/proxy_server.md b/docs/zh_cn/llm/proxy_server.md similarity index 100% rename from docs/zh_cn/serving/proxy_server.md rename to docs/zh_cn/llm/proxy_server.md diff --git a/docs/zh_cn/serving/qos.md b/docs/zh_cn/llm/qos.md similarity index 100% rename from docs/zh_cn/serving/qos.md rename to docs/zh_cn/llm/qos.md diff --git a/docs/zh_cn/serving/api_server_vl.md b/docs/zh_cn/multi_modal/api_server_vl.md similarity index 99% rename from docs/zh_cn/serving/api_server_vl.md rename to docs/zh_cn/multi_modal/api_server_vl.md index 878edbbc6d..fea4d33ef3 100644 --- a/docs/zh_cn/serving/api_server_vl.md +++ b/docs/zh_cn/multi_modal/api_server_vl.md @@ -1,6 +1,6 @@ # 部署 VLM 类 openai 服务 -本文主要介绍单个VL模型在单机多卡环境下,部署兼容 openai 接口服务的方式,以及服务接口的用法。为行文方便,我们把该服务名称为 `api_server`。对于多模型的并行服务,请阅读[请求分发服务器](./proxy_server.md)一文。 +本文主要介绍单个VL模型在单机多卡环境下,部署兼容 openai 接口服务的方式,以及服务接口的用法。为行文方便,我们把该服务名称为 `api_server`。对于多模型的并行服务,请阅读[请求分发服务器](../llm/proxy_server.md)一文。 在这篇文章中, 我们首先介绍服务启动的两种方法,你可以根据应用场景,选择合适的。 diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst new file mode 100644 index 0000000000..c27b420e28 --- /dev/null +++ b/docs/zh_cn/multi_modal/index.rst @@ -0,0 +1,12 @@ +视觉语言模型 +================================= + +.. 
toctree:: + :maxdepth: 2 + :caption: 示例 + + llava.md + internvl.md + xcomposer2d5.md + cogvlm.md + minicpmv.md diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md new file mode 100644 index 0000000000..0b204cb920 --- /dev/null +++ b/docs/zh_cn/multi_modal/internvl.md @@ -0,0 +1,3 @@ +# InternVL + +TODO diff --git a/docs/zh_cn/multi_modal/llava.md b/docs/zh_cn/multi_modal/llava.md new file mode 100644 index 0000000000..cf95e15d5c --- /dev/null +++ b/docs/zh_cn/multi_modal/llava.md @@ -0,0 +1,3 @@ +# LLaVA + +TODO diff --git a/docs/zh_cn/inference/vl_pipeline.md b/docs/zh_cn/multi_modal/vl_pipeline.md similarity index 99% rename from docs/zh_cn/inference/vl_pipeline.md rename to docs/zh_cn/multi_modal/vl_pipeline.md index b550d70e75..31533b38f7 100644 --- a/docs/zh_cn/inference/vl_pipeline.md +++ b/docs/zh_cn/multi_modal/vl_pipeline.md @@ -1,6 +1,6 @@ # VLM 离线推理 pipeline -LMDeploy 把视觉-语言模型(VLM)复杂的推理过程,抽象为简单好用的 pipeline。它的用法与大语言模型(LLM)推理 [pipeline](./pipeline.md) 类似。 +LMDeploy 把视觉-语言模型(VLM)复杂的推理过程,抽象为简单好用的 pipeline。它的用法与大语言模型(LLM)推理 [pipeline](../llm/pipeline.md) 类似。 目前,VLM pipeline 支持以下模型: diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index 4b42f39706..de67293543 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -87,7 +87,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -关于 pipeline 的详细介绍,请参考[这里](../inference/pipeline.md) +关于 pipeline 的详细介绍,请参考[这里](../llm/pipeline.md) 除了推理本地量化模型外,LMDeploy 还支持直接推理 huggingface hub 上的通过 AWQ 量化的 4bit 权重模型,比如 [lmdeploy 空间](https://huggingface.co/lmdeploy)和 [TheBloke 空间](https://huggingface.co/TheBloke)下的模型。 @@ -123,7 +123,7 @@ lmdeploy serve api_server ./internlm2_5-7b-chat-4bit --backend turbomind --model lmdeploy serve api_client http://0.0.0.0:23333 ``` -还可以通过 Swagger UI `http://0.0.0.0:23333` 在线阅读和试用 `api_server` 的各接口,也可直接查阅[文档](../serving/api_server.md),了解各接口的定义和使用方法。 +还可以通过 Swagger UI `http://0.0.0.0:23333` 在线阅读和试用 `api_server` 的各接口,也可直接查阅[文档](../llm/api_server.md),了解各接口的定义和使用方法。 ## 推理性能 diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md index 2a8a863302..b9e881c058 100644 --- a/docs/zh_cn/supported_models/codellama.md +++ b/docs/zh_cn/supported_models/codellama.md @@ -110,4 +110,4 @@ lmdeploy serve api_client api_server_url lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -关于 RESTful API的详细介绍,请参考[这份](../serving/api_server.md)文档。 +关于 RESTful API的详细介绍,请参考[这份](../llm/api_server.md)文档。 diff --git a/generate.sh b/generate.sh index 3c6b5bceed..0c25b8cbf2 100755 --- a/generate.sh +++ b/generate.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash WORKSPACE_PATH=$(dirname "$(readlink -f "$0")") builder="-G Ninja" From 87266b9dc77485919abbc74202bfab11ccf15b50 Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Wed, 7 Aug 2024 16:33:04 +0800 Subject: [PATCH 03/39] clarify pytorch engine does not suport baichuan2 7b awq (#2246) --- docs/en/supported_models/supported_models.md | 2 +- docs/zh_cn/supported_models/supported_models.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index bf23e17ae7..40300d82c1 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -48,7 +48,7 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha | InternLM | 7B - 20B | LLM | Yes | No | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | No | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | No | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | No | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | No | Yes | No | | Baichuan2 | 13B | LLM | Yes | No | No | No | | ChatGLM2 | 6B | LLM | Yes | No | No | No | | Falcon | 7B - 180B | LLM | Yes | No | No | No | diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index b9e021524a..fca936c6f2 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -48,7 +48,7 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | InternLM | 7B - 20B | LLM | Yes | No | Yes | - | | InternLM2 | 7B - 20B | LLM | Yes | No | Yes | Yes | | InternLM2.5 | 7B | LLM | Yes | No | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | No | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | No | Yes | No | | Baichuan2 | 13B | LLM | Yes | No | No | No | | ChatGLM2 | 6B | LLM | Yes | No | No | No | | Falcon | 7B - 180B | LLM | Yes | No | No | No | From 061f99736544c8bf574309d47baf574b69ab7eaf Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:22:23 +0800 Subject: [PATCH 04/39] update test prtest image (#2192) * update * update * Update pr_ete_test.yml * Update pr_ete_test.yml * Update pr_ete_test.yml * update * update * update * update * update * update * update * update * update * updatr * update * update * update * updaet * update * update * updaste * update * update * update * update * update * update * update * update * update --------- Co-authored-by: zhulin1 --- .github/workflows/pr_ete_test.yml | 57 +++++++++---------- .../chat/test_command_chat_hf_pytorch.py | 1 + .../chat/test_command_chat_hf_turbomind.py | 1 + .../tools/chat/test_command_chat_workspace.py | 1 + autotest/tools/convert/test_convert.py | 1 + .../pipeline/test_pipeline_chat_pytorch.py | 1 + .../pipeline/test_pipeline_chat_turbomind.py | 1 + .../test_pipeline_chat_turbomind_vl.py | 17 ++++++ .../quantization/test_quantization_w4a16.py | 1 + .../restful/test_restful_chat_hf_turbomind.py | 8 ++- .../restful/test_restful_chat_workspace.py | 8 ++- autotest/utils/run_restful_chat.py | 6 +- 12 files changed, 70 insertions(+), 33 deletions(-) diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 00e3db24ce..7c0ae8a24d 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -24,6 +24,7 @@ env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA jobs: @@ -33,58 +34,54 @@ jobs: env: REPORT_DIR: /nvme/qa_test_models/test-reports container: - image: nvcr.io/nvidia/tritonserver:24.03-py3 + image: nvidia/cuda:12.4.1-devel-ubuntu22.04 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip - /nvme/share_data/github-actions/packages:/root/packages - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/187:/mnt/187 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - name: Setup systems run: | - rm /etc/apt/sources.list.d/cuda*.list - apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \ - libgoogle-glog-dev libgl1 openjdk-8-jre-headless - rm 
-rf /var/lib/apt/lists/* + rm /etc/apt/sources.list.d/cuda*.list && apt-get update -y && apt-get install -y software-properties-common wget vim &&\ + add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \ + ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3 + echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV" - name: Clone repository uses: actions/checkout@v2 - name: Install pytorch run: | python3 -m pip cache dir - python3 -m pip install torch==2.1.0 torchvision==0.16.0 + python3 -m pip install --upgrade pip setuptools==69.5.1 + python3 -m pip install torch==2.3.0 torchvision==0.18.0 # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-2.5.8+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install /root/packages/flash_attn-2.6.3+cu123torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - name: Build lmdeploy run: | - python3 -m pip install cmake - python3 -m pip install -r requirements/build.txt - mkdir build - cd build - cp -r /nvme/qa_test_models/offline_pkg/_deps . - cmake .. \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ - -DCMAKE_INSTALL_PREFIX=/opt/tritonserver \ - -DBUILD_PY_FFI=ON \ - -DBUILD_MULTI_GPU=ON \ - -DCMAKE_CUDA_FLAGS="-lineinfo" \ - -DUSE_NVTX=ON \ - -DSM=80 \ - -DCMAKE_CUDA_ARCHITECTURES=80 \ - -DBUILD_TEST=OFF - make -j$(nproc) && make install - - name: Install lmdeploy - run: | - python3 -m pip install packaging transformers_stream_generator transformers datasets openai einops - python3 -m pip install -r requirements.txt -r requirements/test.txt - python3 -m pip install . + cp /nvme/qa_test_models/offline_pkg/openmpi-4.1.5.tar.gz . + tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi + make -j$(nproc) && make install && cd .. && rm -rf openmpi-4.1.5* + export PATH=$PATH:/usr/local/openmpi/bin + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib + python3 -m pip install cmake packaging wheel transformers_stream_generator transformers datasets openai einops timm decord + python3 -m pip install -r requirements.txt -r requirements/test.txt -r requirements/build.txt + mkdir -p build && cd build &&\ + sh ../generate.sh &&\ + ninja -j$(nproc) && ninja install &&\ + cd .. &&\ + python3 -m pip install -e . 
&&\ + rm -rf build - name: Check env run: | python3 -m pip list lmdeploy check_env - name: Test lmdeploy - run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test -x --alluredir=allure-results --clean-alluredir + run: | + CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_2' -x --alluredir=allure-results --clean-alluredir + CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_1' -n 2 -x --alluredir=allure-results - name: Generate reports if: always() run: | diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index dc09bdc944..3ee8608604 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -54,6 +54,7 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_pytorch_chat +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_hf_pytorch_chat_pr(config, model, cli_case_config): diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index c659b9ae6b..5f5e1fde59 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -104,6 +104,7 @@ def test_hf_turbomind_base_tp2(config, model, cli_case_config, worker_id): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.hf_turbomind_chat +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize( 'model', diff --git a/autotest/tools/chat/test_command_chat_workspace.py b/autotest/tools/chat/test_command_chat_workspace.py index d73468877e..20f6d0e144 100644 --- a/autotest/tools/chat/test_command_chat_workspace.py +++ b/autotest/tools/chat/test_command_chat_workspace.py @@ -95,6 +95,7 @@ def test_workspace_base_tp2(config, cli_case_config, model, worker_id): @pytest.mark.order(10) @pytest.mark.usefixtures('cli_case_config') @pytest.mark.command_chat +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize( 'model', diff --git a/autotest/tools/convert/test_convert.py b/autotest/tools/convert/test_convert.py index 4c36d5302e..ed5214849f 100644 --- a/autotest/tools/convert/test_convert.py +++ b/autotest/tools/convert/test_convert.py @@ -20,6 +20,7 @@ def test_convert(config, model, worker_id): @pytest.mark.order(5) @pytest.mark.convert +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize( 'model', diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py index b4910742d2..b88422bfec 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py @@ -59,6 +59,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat_pytorch @pytest.mark.flaky(reruns=0) +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) def test_pipeline_chat_pytorch_pr(config, common_case_config, model): diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py index a84dace34f..6373549698 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py 
@@ -98,6 +98,7 @@ def test_pipeline_chat_kvint_tp2(config, common_case_config, model, @pytest.mark.usefixtures('common_case_config') @pytest.mark.pipeline_chat @pytest.mark.flaky(reruns=0) +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize( 'model', diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py index 63e09bf8f6..b02aa21fd3 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py @@ -32,3 +32,20 @@ def test_pipeline_chat_tp2(config, model, worker_id): p.start() p.join() assert_pipeline_vl_chat_log(config, model) + + +@pytest.mark.pipeline_chat +@pytest.mark.gpu_num_1 +@pytest.mark.pr_test +@pytest.mark.parametrize('model', [ + 'liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B', + 'OpenGVLab/InternVL2-8B', 'internlm/internlm-xcomposer2d5-7b' +]) +def test_pipeline_pr_test(config, model, worker_id): + if 'gw' in worker_id: + os.environ['CUDA_VISIBLE_DEVICES'] = str( + int(get_cuda_id_by_workerid(worker_id)) + 5) + p = Process(target=run_pipeline_vl_chat_test, args=(config, model)) + p.start() + p.join() + assert_pipeline_vl_chat_log(config, model) diff --git a/autotest/tools/quantization/test_quantization_w4a16.py b/autotest/tools/quantization/test_quantization_w4a16.py index 118432ba07..b81a3b02f0 100644 --- a/autotest/tools/quantization/test_quantization_w4a16.py +++ b/autotest/tools/quantization/test_quantization_w4a16.py @@ -19,6 +19,7 @@ def test_quantization_w4a16(config, model, worker_id): @pytest.mark.order(3) @pytest.mark.quantization_w4a16 @pytest.mark.pr_test +@pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) @pytest.mark.timeout(900) @pytest.mark.parametrize( diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind.py b/autotest/tools/restful/test_restful_chat_hf_turbomind.py index 4d86c60d9b..4046cf38c2 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind.py @@ -114,6 +114,7 @@ def test_restful_chat_kvint_tp2(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', [{ 'model': 'internlm/internlm2-chat-20b', @@ -126,7 +127,12 @@ def test_restful_chat_kvint_tp2(config, common_case_config, worker_id): }], indirect=True) def test_restful_chat_pr(config, common_case_config): - run_all_step(config, common_case_config) + run_all_step( + config, { + key: value + for key, value in common_case_config.items() + if key == 'memory_test' + }) @pytest.mark.order(7) diff --git a/autotest/tools/restful/test_restful_chat_workspace.py b/autotest/tools/restful/test_restful_chat_workspace.py index 61afc72ca6..a798f4ea6a 100644 --- a/autotest/tools/restful/test_restful_chat_workspace.py +++ b/autotest/tools/restful/test_restful_chat_workspace.py @@ -65,6 +65,7 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): @pytest.mark.usefixtures('common_case_config') @pytest.mark.restful_api @pytest.mark.flaky(reruns=0) +@pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', [{ 'model': 'internlm/internlm2-chat-20b', @@ -77,4 +78,9 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): }], indirect=True) def test_restful_chat_pr(config, common_case_config): - run_all_step(config, 
common_case_config) + run_all_step( + config, { + key: value + for key, value in common_case_config.items() + if key == 'memory_test' + }) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index c659ea7720..92e353487e 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -5,6 +5,7 @@ from time import sleep, time import allure +import psutil from pytest import assume from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid from utils.get_run_config import get_command_with_extra @@ -92,7 +93,10 @@ def start_restful_api(config, param, model, model_path, backend_tpye, def stop_restful_api(pid, startRes, param): if pid > 0: - startRes.terminate() + parent = psutil.Process(pid) + for child in parent.children(recursive=True): + child.terminate() + parent.terminate() if 'modelscope' in param.keys(): modelscope = param['modelscope'] if modelscope: From fb6c5a1d78b34a86a85b3e05212d19d9c57acefc Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Thu, 8 Aug 2024 11:46:58 +0800 Subject: [PATCH 05/39] Remove deprecated arguments from API and clarify model_name and chat_template_name (#1931) * make model_name optional * remove model_name from turbomind engine * add chat_template_name in turbomind model config * tell model_name and chat_template_name apart * test chat.py * develop get_tm_model * remove get_hf_config_content * remove to_file since it is indicated by out_dir * minor fix * add test_async_engine.py * remove tp from class AsyncEngine * --chat-template can be a string * remove deprecates * fix ut * fix when test chatting * fix lmdeploy convert tc * update CLI * update * fix tc * fix * fix according to reviewer comments * update * update * update * update * update * rollback user guide * fix * fix typo * rm trust_remote_code from cli * fix typo * update * fix linting * fix linting * fix lint * fix profile_generation * fix docstring --- .gitignore | 1 + README.md | 2 +- README_ja.md | 2 +- README_zh-CN.md | 2 +- autotest/tools/convert/test_convert.py | 4 +- benchmark/profile_generation.py | 4 +- .../en/benchmark/evaluate_with_opencompass.md | 4 +- docs/en/get_started.md | 7 +- docs/en/llm/codellama.md | 165 +++++++++++++++++ docs/en/multi_modal/cogvlm.md | 14 +- docs/en/multi_modal/minicpmv.md | 8 +- docs/en/multi_modal/xcomposer2d5.md | 7 +- docs/en/supported_models/codellama.md | 111 ------------ .../benchmark/evaluate_with_opencompass.md | 2 +- docs/zh_cn/llm/codellama.md | 165 +++++++++++++++++ docs/zh_cn/multi_modal/cogvlm.md | 13 +- docs/zh_cn/multi_modal/minicpmv.md | 8 +- docs/zh_cn/multi_modal/xcomposer2d5.md | 7 +- docs/zh_cn/supported_models/codellama.md | 113 ------------ lmdeploy/api.py | 27 +-- lmdeploy/archs.py | 11 +- lmdeploy/cli/cli.py | 63 ++----- lmdeploy/cli/lite.py | 46 ----- lmdeploy/cli/serve.py | 31 +--- lmdeploy/cli/utils.py | 63 +++---- lmdeploy/messages.py | 4 - lmdeploy/pytorch/chat.py | 33 ++-- lmdeploy/pytorch/engine/engine.py | 2 - lmdeploy/serve/async_engine.py | 100 ++--------- lmdeploy/serve/openai/api_server.py | 9 +- lmdeploy/turbomind/chat.py | 22 +-- lmdeploy/turbomind/deploy/converter.py | 94 +++++----- .../turbomind/deploy/target_model/base.py | 4 +- lmdeploy/turbomind/deploy/target_model/fp.py | 3 +- .../turbomind/deploy/target_model/plora.py | 3 +- .../turbomind/deploy/target_model/plora_w4.py | 3 +- lmdeploy/turbomind/deploy/target_model/w4.py | 3 +- lmdeploy/turbomind/turbomind.py | 168 +++++------------- lmdeploy/utils.py | 17 -- lmdeploy/vl/templates.py | 6 +- 
tests/test_lmdeploy/test_async_engine.py | 61 ++++--- 41 files changed, 598 insertions(+), 814 deletions(-) create mode 100644 docs/en/llm/codellama.md delete mode 100644 docs/en/supported_models/codellama.md create mode 100644 docs/zh_cn/llm/codellama.md delete mode 100644 docs/zh_cn/supported_models/codellama.md diff --git a/.gitignore b/.gitignore index 4bc25bce48..66a56fc26b 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ dist/ examples/cpp/llama/*.csv *.npy *.weight +install/ # LMDeploy workspace/ diff --git a/README.md b/README.md index 4a6ecdf51b..1c7f368ee3 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ ______________________________________________________________________ - \[2023/11\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75 - \[2023/09\] TurboMind supports Qwen-14B - \[2023/09\] TurboMind supports InternLM-20B -- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide +- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/llm/codellama.md) for deployment guide - \[2023/09\] TurboMind supports Baichuan2-7B - \[2023/08\] TurboMind supports flash-attention2. - \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling diff --git a/README_ja.md b/README_ja.md index 62b77e2149..16a5f4bd70 100644 --- a/README_ja.md +++ b/README_ja.md @@ -53,7 +53,7 @@ ______________________________________________________________________ - \[2023/11\] TurboMindの主要なアップグレード、包括的なPaged Attention、シーケンス長制限のない高速なアテンションカーネル、2倍速いKV8カーネル、Split-Kデコーディング(Flash Decoding)、およびsm_75のW4A16推論 - \[2023/09\] TurboMindはQwen-14Bをサポート - \[2023/09\] TurboMindはInternLM-20Bをサポート -- \[2023/09\] TurboMindはCode Llamaのすべての機能をサポート:コード補完、インフィリング、チャット/インストラクト、Pythonスペシャリスト。デプロイメントガイドは[こちら](./docs/en/supported_models/codellama.md)をクリックしてください +- \[2023/09\] TurboMindはCode Llamaのすべての機能をサポート:コード補完、インフィリング、チャット/インストラクト、Pythonスペシャリスト。デプロイメントガイドは[こちら](./docs/en/llm/codellama.md)をクリックしてください - \[2023/09\] TurboMindはBaichuan2-7Bをサポート - \[2023/08\] TurboMindはflash-attention2をサポート - \[2023/08\] TurboMindはQwen-7B、動的NTK-RoPEスケーリング、動的logNスケーリングをサポート diff --git a/README_zh-CN.md b/README_zh-CN.md index b7d5634fa5..f82e299c37 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -53,7 +53,7 @@ ______________________________________________________________________ - \[2023/11\] TurboMind 重磅升级。包括:Paged Attention、更快的且不受序列最大长度限制的 attention kernel、2+倍快的 KV8 kernels、Split-K decoding (Flash Decoding) 和 支持 sm_75 架构的 W4A16 - \[2023/09\] TurboMind 支持 Qwen-14B - \[2023/09\] TurboMind 支持 InternLM-20B 模型 -- \[2023/09\] TurboMind 支持 Code Llama 所有功能:代码续写、填空、对话、Python专项。点击[这里](./docs/zh_cn/supported_models/codellama.md)阅读部署方法 +- \[2023/09\] TurboMind 支持 Code Llama 所有功能:代码续写、填空、对话、Python专项。点击[这里](./docs/zh_cn/llm/codellama.md)阅读部署方法 - \[2023/09\] TurboMind 支持 Baichuan2-7B - \[2023/08\] TurboMind 支持 flash-attention2 - \[2023/08\] TurboMind 支持 Qwen-7B,动态NTK-RoPE缩放,动态logN缩放 diff --git a/autotest/tools/convert/test_convert.py b/autotest/tools/convert/test_convert.py index ed5214849f..8c7ee15334 100644 --- a/autotest/tools/convert/test_convert.py +++ b/autotest/tools/convert/test_convert.py @@ -40,7 +40,7 @@ def 
convert(config, model_case, cuda_prefix): or 'awq' in model_case.lower()): cmd = get_command_with_extra(' '.join([ 'lmdeploy convert', model_name, origin_model_path, '--dst-path', - dst_path, '--model-format awq --group-size 128 --trust-remote-code' + dst_path, '--model-format awq --group-size 128' ]), config, model_case, @@ -49,7 +49,7 @@ def convert(config, model_case, cuda_prefix): else: cmd = get_command_with_extra(' '.join([ 'lmdeploy convert', model_name, origin_model_path, '--dst-path', - dst_path, '--trust-remote-code' + dst_path ]), config, model_case, diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index da0ddbeab3..81de3dbf45 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -78,14 +78,12 @@ def warmup(model, concurrency: int, input_ids: List[int], warmup_round: int, return print('start to warmup ...') - output_seqlen = gen_config.max_new_tokens def _infer(model, session_id): chatbot = model.create_instance() for _ in range(warmup_round): for _ in chatbot.stream_infer(session_id, input_ids=input_ids, - request_output_len=output_seqlen, sequence_start=True, sequence_end=True, ignore_eos=True, @@ -197,7 +195,7 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int, f'token_latency percentiles(50%,75%,95%,99%)(s): {percentiles}\n' f'throughput(output): {out_token_throughput} token/s\n' f'throughput(total): {total_token_throughput} token/s\n{"-" * 50}') - return tm_model.model_name, \ + return model_path, \ [first_token_latency_min, first_token_latency_max, first_token_latency_ave], \ percentiles, out_token_throughput, total_token_throughput, \ diff --git a/docs/en/benchmark/evaluate_with_opencompass.md b/docs/en/benchmark/evaluate_with_opencompass.md index f078c6e448..574b9ed506 100644 --- a/docs/en/benchmark/evaluate_with_opencompass.md +++ b/docs/en/benchmark/evaluate_with_opencompass.md @@ -141,8 +141,8 @@ models = [internlm_chat_20b] **Note** -- If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#turbomindengineconfig) - and [EngineGenerationConfig](https://lmdeploy.readthedocs.io/en/latest/inference/pipeline.html#generationconfig) +- If you want to pass more arguments for `engine_config`和`gen_config` in the evaluation config file, please refer to [TurbomindEngineConfig](https://github.com/InternLM/lmdeploy/blob/061f99736544c8bf574309d47baf574b69ab7eaf/lmdeploy/messages.py#L114) + and [EngineGenerationConfig](https://github.com/InternLM/lmdeploy/blob/061f99736544c8bf574309d47baf574b69ab7eaf/lmdeploy/messages.py#L56) ## Execute Evaluation Task diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 76045aebab..311980536f 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -48,8 +48,11 @@ pipe = pipeline('internlm/internlm2_5-7b-chat', ``` ```{note} -The parameter "cache_max_entry_count" significantly influences the GPU memory usage. It means the proportion of FREE GPU memory occupied by the K/V cache after the model weights are loaded. -The default value is 0.8. Once allocated, the K/V cache memory is reused repeatedly, which is why it is common to observe that the built pipeline and the api_server mentioned later in the next consumes a substantial amount of GPU memory. +The parameter "cache_max_entry_count" significantly influences the GPU memory usage. 
+It means the proportion of FREE GPU memory occupied by the K/V cache after the model weights are loaded. + +The default value is 0.8. The K/V cache memory is allocated once and reused repeatedly, which is why it is observed that the built pipeline and the "api_server" mentioned later in the next consumes a substantial amount of GPU memory. + If you encounter an Out-of-Memory(OOM) error, you may need to consider lowering the value of "cache_max_entry_count". ``` diff --git a/docs/en/llm/codellama.md b/docs/en/llm/codellama.md new file mode 100644 index 0000000000..b0b2f27e3a --- /dev/null +++ b/docs/en/llm/codellama.md @@ -0,0 +1,165 @@ +# codellama + +## Introduction + +[codellama](https://github.com/facebookresearch/codellama) features enhanced coding capabilities. It can generate code and natural language about code, from both code and natural language prompts (e.g., “Write me a function that outputs the fibonacci sequence”). It can also be used for code completion and debugging. It supports many of the most popular programming languages used today, including Python, C++, Java, PHP, Typescript (Javascript), C#, Bash and more. + +There are three sizes (7b, 13b, 34b) as well as three flavours (base model, Python fine-tuned, and instruction tuned) released on [HuggingFace](https://huggingface.co/codellama). + +| Base Model | Python | Instruct | +| ------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | +| [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [codellama/CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) | +| [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [codellama/CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) | +| [codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | [codellama/CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) | [codellama/CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) | + +The correspondence between the model and capabilities is: + +| models | code completion | infilling | instructions / chat | python specialist | +| ---------- | --------------- | ----------------- | ------------------- | ----------------- | +| Base Model | Y | Y(7B,13B), N(34B) | N | N | +| Python | Y | N | N | Y | +| Instruct | Y | Y(7B,13B), N(34B) | Y | N | + +## Inference + +Based on the above table, this section shows how to utilize CodeLlama's capabilities by examples + +### Completion + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = pipeline('meta-llama/CodeLlama-7b-hf', + chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='completion' + )) + +response = pipe( + 'import socket\n\ndef ping_exponential_backoff(host: str):', + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95 + ) +) +print(response.text) +``` + +### Infilling + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = pipeline('meta-llama/CodeLlama-7b-hf', 
+ chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='infilling' + )) + +prompt = """ +def remove_non_ascii(s: str) -> str: + \"\"\" + + \"\"\" + return result +""" +response = pipe( + prompt, + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95, + max_new_tokens=500 + ) +) +print(response.text) +``` + +### Chat + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = pipeline('meta-llama/CodeLlama-7b-Instruct-hf', + chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='chat' + )) + +response = pipe( + 'implement quick sort in C++', + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95 + ) +) +print(response.text) +``` + +### Python specialist + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = pipeline('meta-llama/CodeLlama-7b-Python-hf', + chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='python' + )) + +response = pipe( + 'implement quick sort', + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95 + ) +) +print(response.text) +``` + +## Quantization + +TBD + +## Serving + +Prepare a chat template json file, for instance "codellama.json", with the following content: + +```json +{ + "model_name": "codellama", + "capability": "completion" +} +``` + +Then launch the service as follows: + +```shell +lmdeploy serve api_server meta-llama/CodeLlama-7b-Instruct-hf --chat-template codellama.json +``` + +After the service is launched successfully, you can access the service with `openai` package: + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "user", "content": "import socket\n\ndef ping_exponential_backoff(host: str):"}, + ], + temperature=0.1, + top_p=0.95, + max_tokens=500 +) +print(response) +``` + +Regarding the detailed information of the api_server, you can refer to the [guide](../llm/api_server.md). diff --git a/docs/en/multi_modal/cogvlm.md b/docs/en/multi_modal/cogvlm.md index fcffdeb3c3..d2114e574c 100644 --- a/docs/en/multi_modal/cogvlm.md +++ b/docs/en/multi_modal/cogvlm.md @@ -17,17 +17,7 @@ pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https: pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https://download.pytorch.org/whl/cu121 ``` -Install LMDeploy with pip (Python 3.8+). Refer to [Installation](https://lmdeploy.readthedocs.io/en/latest/get_started.html#installation) for more. - -```shell -# cuda 11.8 -# to get the latest version, run: pip index versions lmdeploy -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -# cuda 12.1 -pip install lmdeploy -``` +Install LMDeploy by following the [installation guide](../installation.md) ### Prepare @@ -43,7 +33,7 @@ huggingface-cli download lmsys/vicuna-7b-v1.5 special_tokens_map.json tokenizer. ### Offline inference pipeline -The following sample code shows the basic usage of VLM pipeline. 
For more examples, please refer to [VLM Offline Inference Pipeline](https://lmdeploy.readthedocs.io/en/latest/inference/vl_pipeline.html#vlm-offline-inference-pipeline) +The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index efc4ec823c..73bacf82a4 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -6,15 +6,11 @@ ## Quick Start -Install LMDeploy with pip (Python 3.8+). Refer to [Installation](https://lmdeploy.readthedocs.io/en/latest/get_started.html#installation) for more. - -```shell -pip install lmdeploy -``` +Please install LMDeploy by following the [installation guide](../installation.md) ### Offline inference pipeline -The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](https://lmdeploy.readthedocs.io/en/latest/inference/vl_pipeline.html#vlm-offline-inference-pipeline) +The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline diff --git a/docs/en/multi_modal/xcomposer2d5.md b/docs/en/multi_modal/xcomposer2d5.md index 81903c128e..d6883c0023 100644 --- a/docs/en/multi_modal/xcomposer2d5.md +++ b/docs/en/multi_modal/xcomposer2d5.md @@ -8,18 +8,15 @@ ### Installation -Install LMDeploy with pip (Python 3.8+). Refer to [Installation](https://lmdeploy.readthedocs.io/en/latest/get_started.html#installation) for more. +Please install LMDeploy by following the [installation guide](../installation.md), and install other packages that InternLM-XComposer-2.5 needs ```shell -pip install lmdeploy - -# install other packages that InternLM-XComposer-2.5 needs pip install decord ``` ### Offline inference pipeline -The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](https://lmdeploy.readthedocs.io/en/latest/inference/vl_pipeline.html#vlm-offline-inference-pipeline) +The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md deleted file mode 100644 index 5ef5bfa691..0000000000 --- a/docs/en/supported_models/codellama.md +++ /dev/null @@ -1,111 +0,0 @@ -# codellama - -## Introduction - -[codellama](https://github.com/facebookresearch/codellama) features enhanced coding capabilities. It can generate code and natural language about code, from both code and natural language prompts (e.g., “Write me a function that outputs the fibonacci sequence”). It can also be used for code completion and debugging. It supports many of the most popular programming languages used today, including Python, C++, Java, PHP, Typescript (Javascript), C#, Bash and more. - -There are three sizes (7b, 13b, 34b) as well as three flavours (base model, Python fine-tuned, and instruction tuned) released on [HuggingFace](https://huggingface.co/codellama). 
- -| Base Model | Python | Instruct | -| ------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | -| [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [codellama/CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) | -| [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [codellama/CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) | -| [codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | [codellama/CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) | [codellama/CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) | - -The correspondence between the model and capabilities is: - -| models | code completion | infilling | instructions / chat | python specialist | -| ---------- | --------------- | ----------------- | ------------------- | ----------------- | -| Base Model | Y | Y(7B,13B), N(34B) | N | N | -| Python | Y | N | N | Y | -| Instruct | Y | Y(7B,13B), N(34B) | Y | N | - -## Inference - -Based on the above table, download the model that meets your requirements. Execute the following command to interact with the model in the console: - -```shell -# install lmdeploy -python3 -m pip install lmdeploy[all] - -# convert weight layout -lmdeploy convert codellama /the/path/of/codellama/model -``` - -Then, you can communicate with codellama in consolo by following instructions in next sections - -**Note**: - -- minimum requirement of `transformers` is **v4.33.0** -- lmdeploy supports copying code blocks to the console. But you have to press enter, input "!!" and press enter again to end the prompt. The way to input prompt for other supported models keeps unchanged, i.e., double pressing enter. - -### Completion - -```shell -lmdeploy chat ./workspace --cap completion -``` - -### Infilling - -```shell -lmdeploy chat ./workspace --cap infilling -``` - -The input code is supposed to have a special placeholder ``. For example, - -``` -def remove_non_ascii(s: str) -> str: - """ - return result -``` - -And the generated code piece by `turbomind.chat` is the one to be filled in `` - -### Chat - -``` -lmdeploy chat ./workspace --cap chat --meta-instruct "Provide answers in Python" -``` - -`--meta-instruct` instruction can be changed to other coding languages as long as codellama supports it - -### Python specialist - -``` -lmdeploy chat ./workspace --cap python -``` - -Python fine-tuned model is highly recommended when 'python specialist' capability is required. - -## Quantization - -TBD - -## Serving - -**LMDeploy server only supports `chat` capabllity**. The res ones are going to be supported soon. - -Launch inference server by: - -```shell -# --tp: the number of GPUs used in tensor parallelism -lmdeploy serve api_server ./workspace --server-name ${server_ip} --server-port ${server_port} --tp 1 -``` - -Then, you can communicate with it by command line, - -```shell -# restful_api_url is what printed in api_server.py, e.g. 
http://localhost:23333 -lmdeploy serve api_client api_server_url -``` - -or through webui after launching gradio, - -```shell -# api_server_url is what printed in api_server.py, e.g. http://localhost:23333 -# server_ip and server_port here are for gradio ui -# example: lmdeploy serve gradio http://localhost:23333 --server-name localhost --server-port 6006 -lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} -``` - -Regarding the detailed information of RESTful API, you can refer to the [guide](../llm/api_server.md). diff --git a/docs/zh_cn/benchmark/evaluate_with_opencompass.md b/docs/zh_cn/benchmark/evaluate_with_opencompass.md index d45c8b28a0..94ba5326bb 100644 --- a/docs/zh_cn/benchmark/evaluate_with_opencompass.md +++ b/docs/zh_cn/benchmark/evaluate_with_opencompass.md @@ -139,7 +139,7 @@ models = [internlm_chat_20b] **注** -- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#turbomindengineconfig) 和 [EngineGenerationConfig](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/pipeline.html#generationconfig) +- 如果想在测评配置文件中`engine_config`和`gen_config`字段传递更多参数,请参考[TurbomindEngineConfig](https://github.com/InternLM/lmdeploy/blob/061f99736544c8bf574309d47baf574b69ab7eaf/lmdeploy/messages.py#L114) 和 [EngineGenerationConfig](https://github.com/InternLM/lmdeploy/blob/061f99736544c8bf574309d47baf574b69ab7eaf/lmdeploy/messages.py#L56) ## 执行测评任务 diff --git a/docs/zh_cn/llm/codellama.md b/docs/zh_cn/llm/codellama.md new file mode 100644 index 0000000000..945e866a5d --- /dev/null +++ b/docs/zh_cn/llm/codellama.md @@ -0,0 +1,165 @@ +# Code Llama + +## 模型介绍 + +[codellama](https://github.com/facebookresearch/codellama) 支持很多种编程语言,包括 Python, C++, Java, PHP, Typescript (Javascript), C#, Bash 等等。具备代码续写、代码填空、对话、python专项等 4 种能力。 + +它在 [HuggingFace](https://huggingface.co/codellama) 上发布了基座模型,Python模型和指令微调模型: + +| 基座模型 | Python微调模型 | 指令模型 | +| ------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | +| [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [codellama/CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) | +| [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [codellama/CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) | +| [codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | [codellama/CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) | [codellama/CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) | + +模型和能力的对应关系为: + +| 模型 | 代码续写 | 代码填空 | 对话 | Python专项 | +| -------------- | -------- | ----------------- | ---- | ---------- | +| 基座模型 | Y | Y(7B,13B), N(34B) | N | N | +| Python微调模型 | Y | N | N | Y | +| 指令微调模型 | Y | Y(7B,13B), N(34B) | Y | N | + +## 推理 + +根据前文模型的能力表,在本小节中,我们讲通过具体的示例展示使用 CodeLlama 各能力的方法 + +### 代码续写 + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = 
pipeline('meta-llama/CodeLlama-7b-hf', + chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='completion' + )) + +response = pipe( + 'import socket\n\ndef ping_exponential_backoff(host: str):', + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95 + ) +) +print(response.text) +``` + +### 代码填空 + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = pipeline('meta-llama/CodeLlama-7b-hf', + chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='infilling' + )) + +prompt = """ +def remove_non_ascii(s: str) -> str: + \"\"\" + + \"\"\" + return result +""" +response = pipe( + prompt, + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95, + max_new_tokens=500 + ) +) +print(response.text) +``` + +### 对话 + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = pipeline('meta-llama/CodeLlama-7b-Instruct-hf', + chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='chat' + )) + +response = pipe( + 'implement quick sort in C++', + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95 + ) +) +print(response.text) +``` + +### Python 专项 + +```python +from lmdeploy import pipeline, GenerationConfig, ChatTemplateConfig + +pipe = pipeline('meta-llama/CodeLlama-7b-Python-hf', + chat_template_config=ChatTemplateConfig( + model_name='codellama', + capability='python' + )) + +response = pipe( + 'implement quick sort', + gen_config=GenerationConfig( + top_k=10, + temperature=0.1, + top_p=0.95 + ) +) +print(response.text) +``` + +## 量化 + +TBD + +## 服务 + +准备好对话模板文件,比如说“codellama.json”,参考如下示例,填写 CodeLlama 的能力: + +```json +{ + "model_name": "codellama", + "capability": "completion" +} +``` + +然后,启动推理服务: + +```shell +lmdeploy serve api_server meta-llama/CodeLlama-7b-Instruct-hf --chat-template codellama.json +``` + +在服务启动成功后,可以通过`openai`客户端接口,访问服务: + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "user", "content": "import socket\n\ndef ping_exponential_backoff(host: str):"}, + ], + temperature=0.1, + top_p=0.95, + max_tokens=500 +) +print(response) +``` + +关于 api_server 的详细介绍,请参考[这份](../llm/api_server.md)文档。 diff --git a/docs/zh_cn/multi_modal/cogvlm.md b/docs/zh_cn/multi_modal/cogvlm.md index 91ae9a3d4a..131ad4f4aa 100644 --- a/docs/zh_cn/multi_modal/cogvlm.md +++ b/docs/zh_cn/multi_modal/cogvlm.md @@ -17,16 +17,7 @@ pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https: pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https://download.pytorch.org/whl/cu121 ``` -使用 pip(Python 3.8+)安装LMDeploy,更多安装方式参考 [安装](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html#id2)。 - -```shell -# cuda 11.8 -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -# cuda 12.1 -pip install lmdeploy -``` +请参考[安装文档](../installation.md)安装 LMDeploy ### 准备 @@ -41,7 +32,7 @@ huggingface-cli download lmsys/vicuna-7b-v1.5 special_tokens_map.json tokenizer. 
### 离线推理 pipeline -以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/vl_pipeline.html#vlm-pipeline) +以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md index 6a02a94c20..20fc1eac92 100644 --- a/docs/zh_cn/multi_modal/minicpmv.md +++ b/docs/zh_cn/multi_modal/minicpmv.md @@ -8,15 +8,11 @@ ### 安装 -使用 pip(Python 3.8+) 安装 LMDeploy,更多安装方式参考 [安装](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html#id2)。 - -```shell -pip install lmdeploy -``` +请参考[安装文档](../installation.md)安装 LMDeploy ### 离线推理 pipeline -以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/vl_pipeline.html#vlm-pipeline) +以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline diff --git a/docs/zh_cn/multi_modal/xcomposer2d5.md b/docs/zh_cn/multi_modal/xcomposer2d5.md index 551ac5f026..31973a4941 100644 --- a/docs/zh_cn/multi_modal/xcomposer2d5.md +++ b/docs/zh_cn/multi_modal/xcomposer2d5.md @@ -8,18 +8,15 @@ ### 安装 -使用 pip(Python 3.8+) 安装 LMDeploy,更多安装方式参考 [安装](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html#id2)。 +请参考[安装文档](../installation.md)安装 LMDeploy,并安装上游模型库 InternLM-XComposer-2.5 所需的依赖。 ```shell -pip install lmdeploy - -# install other packages that InternLM-XComposer-2.5 needs pip install decord ``` ### 离线推理 pipeline -以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](https://lmdeploy.readthedocs.io/zh-cn/latest/inference/vl_pipeline.html#vlm-pipeline) +以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md deleted file mode 100644 index b9e881c058..0000000000 --- a/docs/zh_cn/supported_models/codellama.md +++ /dev/null @@ -1,113 +0,0 @@ -# Code Llama - -## 模型介绍 - -[codellama](https://github.com/facebookresearch/codellama) 支持很多种编程语言,包括 Python, C++, Java, PHP, Typescript (Javascript), C#, Bash 等等。具备代码续写、代码填空、对话、python专项等 4 种能力。 - -它在 [HuggingFace](https://huggingface.co/codellama) 上发布了基座模型,Python模型和指令微调模型: - -| 基座模型 | Python微调模型 | 指令模型 | -| ------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | -| [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [codellama/CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) | -| [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [codellama/CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) | -| [codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | [codellama/CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) | [codellama/CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) | - -模型和能力的对应关系为: - -| 模型 | 代码续写 | 代码填空 | 对话 | Python专项 | -| -------------- | -------- | ----------------- | ---- | ---------- | -| 基座模型 | Y | 
Y(7B,13B), N(34B) | N | N | -| Python微调模型 | Y | N | N | Y | -| 指令微调模型 | Y | Y(7B,13B), N(34B) | Y | N | - -## 推理 - -根据上述的模型和能力关系表,下载感兴趣的模型。执行如下的命令,把模型权重转成 turbomind 要求的格式: - -```shell -# 安装 lmdeploy -python3 -m pip install lmdeploy[all] - -# 转模型格式 -lmdeploy convert codellama /path/of/codellama/model -``` - -接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。 - -**注意**: - -- **transformers最低要求 v4.33.0** -- `lmdeploy.turbomind.chat` 支持把代码块拷贝到控制台,**结束输出的方式为回车,再输入"!!",再回车**。其他非 codellama 模型,仍然是两次回车结束输入。 - -### 代码续写 - -```shell -lmdeploy chat ./workspace --cap completion -``` - -### 代码填空 - -```shell -lmdeploy chat ./workspace --cap infilling -``` - -输入的代码块中要包含 ``,比如: - -``` -def remove_non_ascii(s: str) -> str: - """ - return result -``` - -`turbomind.chat` 输出的代码即是要填到 `` 中的内容 - -### 对话 - -``` -lmdeploy chat ./workspace --cap chat --meta-instruction "Provide answers in Python" -``` - -可以把 `--meta-instruct` 的指令换成 codellama 支持的其他变成语言。 - -### Python 专项 - -``` -lmdeploy chat ./workspace --cap python -``` - -建议这里部署 Python 微调模型 - -## 量化 - -TBD - -## 服务 - -**目前,server 支持的是对话功能**,其余功能后续再加上。 - -启动 sever 的方式是: - -```shell -# --tp: 在 tensor parallel时,使用的GPU数量 -lmdeploy serve api_server ./workspace --server-name 0.0.0.0 --server-port ${server_port} --tp 1 -``` - -打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。 - -你可以用命令行,在控制台与 server 通信: - -```shell -# api_server_url 就是 api_server 产生的,比如 http://localhost:23333 -lmdeploy serve api_client api_server_url -``` - -或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流: - -```shell -# api_server_url 就是 api_server 产生的,比如 http://localhost:23333 -# server_ip 和 server_port 是用来提供 gradio ui 访问服务的 -# 例子: lmdeploy serve gradio http://localhost:23333 --server-name localhost --server-port 6006 -lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} -``` - -关于 RESTful API的详细介绍,请参考[这份](../llm/api_server.md)文档。 diff --git a/lmdeploy/api.py b/lmdeploy/api.py index eba0cad89d..fa91e93a9c 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -8,7 +8,6 @@ def pipeline(model_path: str, - model_name: Optional[str] = None, backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None, chat_template_config: Optional[ChatTemplateConfig] = None, @@ -29,9 +28,6 @@ def pipeline(model_path: str, on huggingface.co, such as "internlm/internlm-chat-7b", "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. - model_name (str): needed when model_path is a pytorch model on - huggingface.co, such as "internlm/internlm-chat-7b", - "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend config instance. Default to None. chat_template_config (ChatTemplateConfig): chat template configuration. @@ -77,21 +73,11 @@ def pipeline(model_path: str, backend = 'pytorch' if type( backend_config) is PytorchEngineConfig else 'turbomind' logger.info(f'Using {backend} engine') - if 'tp' in kwargs: - logger.warning( - 'The argument "tp" is deprecated and will be removed soon. ' - 'Please set "tp" in "backend_config"') - tp = kwargs['tp'] - kwargs.pop('tp') - else: - tp = 1 if backend_config is None else backend_config.tp return pipeline_class(model_path, - model_name=model_name, backend=backend, backend_config=backend_config, chat_template_config=chat_template_config, - tp=tp, **kwargs) @@ -123,9 +109,9 @@ def serve(model_path: str, on huggingface.co, such as "internlm/internlm-chat-7b", "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. 
- model_name (str): needed when model_path is a pytorch model on - huggingface.co, such as "internlm/internlm-chat-7b", - "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. + model_name (str): the name of the served model. It can be accessed + by the RESTful API `/v1/models`. If it is not specified, + `model_path` will be adopted backend (str): either `turbomind` or `pytorch` backend. Default to `turbomind` backend. backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend @@ -159,11 +145,7 @@ def serve(model_path: str, backend_config = autoget_backend_config(model_path, backend_config) backend = 'pytorch' if type( backend_config) is PytorchEngineConfig else 'turbomind' - if 'tp' in kwargs: - tp = kwargs['tp'] - kwargs.pop('tp') - else: - tp = 1 if backend_config is None else backend_config.tp + task = Process(target=serve, args=(model_path, ), kwargs=dict(model_name=model_name, @@ -172,7 +154,6 @@ def serve(model_path: str, chat_template_config=chat_template_config, server_name=server_name, server_port=server_port, - tp=tp, log_level=log_level, api_keys=api_keys, ssl=ssl, diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py index 93c8af5372..64b714765a 100644 --- a/lmdeploy/archs.py +++ b/lmdeploy/archs.py @@ -4,8 +4,6 @@ from transformers import AutoConfig -from lmdeploy.utils import get_hf_config_content - from .messages import PytorchEngineConfig, TurbomindEngineConfig from .utils import get_logger @@ -142,8 +140,8 @@ def get_task(model_path: str): if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')): # workspace model return 'llm', AsyncEngine - config = get_hf_config_content(model_path) - if check_vl_llm(config): + _, config = get_model_arch(model_path) + if check_vl_llm(config.to_dict()): from lmdeploy.serve.vl_async_engine import VLAsyncEngine return 'vlm', VLAsyncEngine @@ -165,7 +163,10 @@ def get_model_arch(model_path: str): config = configparser.ConfigParser() config.read(config_file) model_arch = config['llama']['model_arch'] - return model_arch, None + tm_config = TurbomindEngineConfig() + for key in config['llama']: + setattr(tm_config, key, config['llama'][key]) + return model_arch, tm_config else: # transformers model try: diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index e56509f29e..4a98d830d7 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -4,7 +4,7 @@ from ..version import __version__ from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, - convert_args, get_lora_adapters) + convert_args, get_chat_template, get_lora_adapters) class CLI(object): @@ -34,9 +34,10 @@ def add_parser_convert(): parser.add_argument( 'model_name', type=str, - help='The name of the to-be-deployed model, such as llama-7b, ' - 'llama-13b, vicuna-7b and etc. You can run `lmdeploy list` to ' - 'get the supported model names') + help='deprecated and unused, ' + 'it will be removed on 2024.12.31. 
It was originally used to ' + 'specify the name of the built-in chat template, but now it ' + 'is substituted with a clearer parameter `--chat-template`') parser.add_argument('model_path', type=str, help='The directory path of the model') @@ -53,21 +54,18 @@ def add_parser_convert(): type=str, default='workspace', help='The destination path that saves outputs') - parser.add_argument( - '--quant-path', - type=str, - default=None, - help='Path of the quantized model, which can be none') parser.add_argument( '--group-size', type=int, default=0, help='A parameter used in awq to quantize fp16 weights ' 'to 4 bits') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - + parser.add_argument( + '--chat-template', + type=str, + default=None, + help='the name of the built-in chat template, which can be ' + 'overviewed by `lmdeploy list`') parser.set_defaults(run=CLI.convert) @staticmethod @@ -104,10 +102,7 @@ def add_parser_chat(): ', "baichuan-inc/baichuan2-7b-chat" and so on') # common args ArgumentHelper.backend(parser) - ArgumentHelper.trust_remote_code(parser) # # chat template args - ArgumentHelper.meta_instruction(parser) - ArgumentHelper.cap(parser) ArgumentHelper.chat_template(parser) # model args ArgumentHelper.revision(parser) @@ -119,7 +114,6 @@ def add_parser_chat(): # common engine args tp_act = ArgumentHelper.tp(pt_group) - model_name_act = ArgumentHelper.model_name(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) @@ -128,7 +122,6 @@ def add_parser_chat(): tb_group = parser.add_argument_group('TurboMind engine arguments') # common engine args tb_group._group_actions.append(tp_act) - tb_group._group_actions.append(model_name_act) tb_group._group_actions.append(session_len_act) tb_group._group_actions.append(cache_max_entry_act) tb_group._group_actions.append(prefix_caching_act) @@ -164,24 +157,8 @@ def list(args): """List the supported model names.""" from lmdeploy.model import MODELS model_names = list(MODELS.module_dict.keys()) - deprecate_names = [ - 'baichuan-7b', 'baichuan2-7b', 'chatglm2-6b', 'internlm-chat-20b', - 'internlm-chat-7b', 'internlm-chat-7b-8k', 'internlm2-1_8b', - 'internlm-20b', 'internlm2-20b', 'internlm2-7b', 'internlm2-chat', - 'internlm2-chat-1_8b', 'internlm2-chat-20b', 'internlm2-chat-7b', - 'llama-2-chat', 'llama-2', 'qwen-14b', 'qwen-7b', 'solar-70b', - 'yi-200k', 'yi-34b', 'yi-chat', 'Mistral-7B-Instruct', - 'Mixtral-8x7B-Instruct', 'baichuan-base', 'deepseek-chat', - 'internlm-chat' - ] - model_names = [ - n for n in model_names if n not in deprecate_names + ['base'] - ] - deprecate_names.sort() model_names.sort() - print('The older chat template name like "internlm2-7b", "qwen-7b"' - ' and so on are deprecated and will be removed in the future.' 
- ' The supported chat template names are:') + print('The supported chat template names are:') print('\n'.join(model_names)) @staticmethod @@ -254,27 +231,20 @@ def get_gpu_topo(): def chat(args): """Chat with pytorch or turbomind engine.""" from lmdeploy.archs import autoget_backend - from lmdeploy.model import ChatTemplateConfig + + chat_template_config = get_chat_template(args.chat_template) + backend = args.backend if backend != 'pytorch': # set auto backend mode backend = autoget_backend(args.model_path) - chat_template_config = ChatTemplateConfig( - model_name=args.model_name, - meta_instruction=args.meta_instruction, - capability=args.cap) - if args.chat_template: - chat_template_config = ChatTemplateConfig.from_json( - args.chat_template) - if backend == 'pytorch': from lmdeploy.messages import PytorchEngineConfig from lmdeploy.pytorch.chat import run_chat adapters = get_lora_adapters(args.adapters) engine_config = PytorchEngineConfig( - model_name=args.model_name, tp=args.tp, session_len=args.session_len, cache_max_entry_count=args.cache_max_entry_count, @@ -283,14 +253,11 @@ def chat(args): ) run_chat(args.model_path, engine_config, - trust_remote_code=args.trust_remote_code, chat_template_config=chat_template_config) else: from lmdeploy.turbomind.chat import main as run_chat kwargs = convert_args(args) kwargs.pop('chat_template') - kwargs.pop('meta_instruction') - kwargs.pop('trust_remote_code') kwargs.pop('backend') kwargs['chat_template_config'] = chat_template_config run_chat(**kwargs) diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index cea9735ade..56b0ebe07c 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -1,6 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmengine.config import DictAction - from .cli import CLI from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args @@ -95,42 +93,6 @@ def add_parser_smooth_quant(): ArgumentHelper.calib_search_scale(parser) ArgumentHelper.device(parser) - @staticmethod - def add_parser_kv_qparams(): - """Add parser for kv_qparams command.""" - parser = SubCliLite.subparsers.add_parser( - 'kv_qparams', - formatter_class=DefaultsAndTypesHelpFormatter, - description=SubCliLite.kv_qparams.__doc__, - help=SubCliLite.kv_qparams.__doc__) - parser.set_defaults(run=SubCliLite.kv_qparams) - - parser.add_argument('work_dir', - type=str, - help='Directory path where the stats are saved') - parser.add_argument('turbomind_dir', - type=str, - help='Directory path where to save the results') - parser.add_argument('--kv-bits', - type=int, - default=8, - help='Number of bits for quantization') - parser.add_argument('--kv-sym', - action='store_true', - help='Whether to use symmetric quantizaiton') - parser.add_argument( - '--num-tp', - type=int, - default=None, - help='GPU number used in tensor parallelism. 
Should be 2^n') - parser.add_argument('--tm-params', - nargs='*', - default=None, - action=DictAction, - help='Used key-values pairs in xxx=yyy format' - ' to update the turbomind model weights' - ' config') - @staticmethod def auto_awq(args): """Perform weight quantization using AWQ algorithm.""" @@ -145,13 +107,6 @@ def calibrate(args): kwargs = convert_args(args) calibrate(**kwargs) - @staticmethod - def kv_qparams(args): - """Export key and value stats.""" - from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams - kwargs = convert_args(args) - run_kv_qparams(**kwargs) - @staticmethod def smooth_quant(args): """Perform w8a8 quantization using SmoothQuant.""" @@ -164,5 +119,4 @@ def add_parsers(): """Add all parsers.""" SubCliLite.add_parser_auto_awq() SubCliLite.add_parser_calibrate() - SubCliLite.add_parser_kv_qparams() SubCliLite.add_parser_smooth_quant() diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index b5e2f4d39e..33ca0c36e4 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .cli import CLI from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, - convert_args, get_lora_adapters) + convert_args, get_chat_template, get_lora_adapters) class SubCliServe: @@ -51,16 +51,13 @@ def add_parser_gradio(): ArgumentHelper.download_dir(parser) # chat template args - ArgumentHelper.meta_instruction(parser) # TODO remove ArgumentHelper.chat_template(parser) - ArgumentHelper.cap(parser) # pytorch engine args pt_group = parser.add_argument_group('PyTorch engine arguments') # common engine args tp_act = ArgumentHelper.tp(pt_group) - model_name_act = ArgumentHelper.model_name(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) max_batch_size_act = ArgumentHelper.max_batch_size(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) @@ -71,7 +68,6 @@ def add_parser_gradio(): tb_group = parser.add_argument_group('TurboMind engine arguments') # common engine args tb_group._group_actions.append(tp_act) - tb_group._group_actions.append(model_name_act) tb_group._group_actions.append(session_len_act) tb_group._group_actions.append(max_batch_size_act) tb_group._group_actions.append(cache_max_entry_act) @@ -138,11 +134,10 @@ def add_parser_api_server(): ArgumentHelper.log_level(parser) ArgumentHelper.api_keys(parser) ArgumentHelper.ssl(parser) + ArgumentHelper.model_name(parser) # chat template args - ArgumentHelper.meta_instruction(parser) # TODO remove ArgumentHelper.chat_template(parser) - ArgumentHelper.cap(parser) # model args ArgumentHelper.revision(parser) @@ -154,7 +149,6 @@ def add_parser_api_server(): ArgumentHelper.adapters(pt_group) # common engine args tp_act = ArgumentHelper.tp(pt_group) - model_name_act = ArgumentHelper.model_name(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) max_batch_size_act = ArgumentHelper.max_batch_size(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) @@ -165,7 +159,6 @@ def add_parser_api_server(): tb_group = parser.add_argument_group('TurboMind engine arguments') # common engine args tb_group._group_actions.append(tp_act) - tb_group._group_actions.append(model_name_act) tb_group._group_actions.append(session_len_act) tb_group._group_actions.append(max_batch_size_act) tb_group._group_actions.append(cache_max_entry_act) @@ -206,7 +199,6 @@ def gradio(args): from lmdeploy.archs import autoget_backend from lmdeploy.messages import (PytorchEngineConfig, TurbomindEngineConfig) - 
from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.gradio.app import run backend = args.backend @@ -216,7 +208,6 @@ def gradio(args): if backend == 'pytorch': backend_config = PytorchEngineConfig( tp=args.tp, - model_name=args.model_name, max_batch_size=args.max_batch_size, cache_max_entry_count=args.cache_max_entry_count, block_size=args.cache_block_seq_len, @@ -225,7 +216,6 @@ def gradio(args): ) else: backend_config = TurbomindEngineConfig( - model_name=args.model_name, tp=args.tp, max_batch_size=args.max_batch_size, session_len=args.session_len, @@ -236,13 +226,7 @@ def gradio(args): cache_block_seq_len=args.cache_block_seq_len, enable_prefix_caching=args.enable_prefix_caching, ) - chat_template_config = ChatTemplateConfig( - model_name=args.model_name, - meta_instruction=args.meta_instruction, - capability=args.cap) - if args.chat_template: - chat_template_config = ChatTemplateConfig.from_json( - args.chat_template) + chat_template_config = get_chat_template(args.chat_template) run(args.model_path_or_server, server_name=args.server_name, server_port=args.server_port, @@ -255,7 +239,6 @@ def gradio(args): def api_server(args): """Serve LLMs with restful api using fastapi.""" from lmdeploy.archs import autoget_backend - from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.openai.api_server import serve as run_api_server backend = args.backend if backend != 'pytorch': @@ -267,7 +250,6 @@ def api_server(args): adapters = get_lora_adapters(args.adapters) backend_config = PytorchEngineConfig( tp=args.tp, - model_name=args.model_name, max_batch_size=args.max_batch_size, cache_max_entry_count=args.cache_max_entry_count, block_size=args.cache_block_seq_len, @@ -278,7 +260,6 @@ def api_server(args): else: from lmdeploy.messages import TurbomindEngineConfig backend_config = TurbomindEngineConfig( - model_name=args.model_name, tp=args.tp, max_batch_size=args.max_batch_size, session_len=args.session_len, @@ -289,10 +270,8 @@ def api_server(args): cache_block_seq_len=args.cache_block_seq_len, enable_prefix_caching=args.enable_prefix_caching, ) - chat_template_config = None - if args.chat_template: - chat_template_config = ChatTemplateConfig.from_json( - args.chat_template) + chat_template_config = get_chat_template(args.chat_template) + from lmdeploy.messages import VisionConfig vision_config = VisionConfig(args.vision_max_batch_size) run_api_server(args.model_path, diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 2ceeba79c9..468f10115e 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -61,6 +61,30 @@ def get_lora_adapters(adapters: List[str]): return output +def get_chat_template(chat_template: str): + """get chat template config. + + Args + chat_template(str): it could be a builtin chat template name, + or a chat template json file + """ + import os + + from lmdeploy.model import ChatTemplateConfig + if chat_template: + if os.path.isfile(chat_template): + return ChatTemplateConfig.from_json(chat_template) + else: + from lmdeploy.model import MODELS + assert chat_template in MODELS.module_dict.keys(), \ + f"chat template '{chat_template}' is not " \ + f'registered. 
The builtin chat templates are: ' \ + f'{MODELS.module_dict.keys()}' + return ChatTemplateConfig(model_name=chat_template) + else: + return None + + class ArgumentHelper: """Helper class to add unified argument.""" @@ -72,10 +96,9 @@ def model_name(parser): '--model-name', type=str, default=None, - help='The name of the to-be-deployed model, such as' - ' llama-7b, llama-13b, vicuna-7b and etc. You ' - 'can run `lmdeploy list` to get the supported ' - 'model names') + help='The name of the served model. It can be accessed ' + 'by the RESTful API `/v1/models`. If it is not specified, ' + '`model_path` will be adopted') @staticmethod def model_format(parser, default: str = None): @@ -222,18 +245,6 @@ def repetition_penalty(parser): default=1.0, help='Parameter to penalize repetition') - @staticmethod - def cap(parser): - """Add argument cap to parser.""" - - return parser.add_argument( - '--cap', - type=str, - default='chat', - choices=['completion', 'infilling', 'chat', 'python'], - help='The capability of a model. ' - 'Deprecated. Please use --chat-template instead') - @staticmethod def log_level(parser): """Add argument log_level to parser.""" @@ -347,17 +358,6 @@ def device(parser): choices=['cuda', 'cpu'], help='Device type of running') - @staticmethod - def meta_instruction(parser): - """Add argument meta_instruction to parser.""" - - return parser.add_argument( - '--meta-instruction', - type=str, - default=None, - help='System prompt for ChatTemplateConfig. Deprecated. ' - 'Please use --chat-template instead') - @staticmethod def chat_template(parser): """Add chat template config to parser.""" @@ -406,15 +406,6 @@ def work_dir(parser): default='./work_dir', help='The working directory to save results') - @staticmethod - def trust_remote_code(parser): - """Add argument trust_remote_code to parser.""" - return parser.add_argument( - '--trust-remote-code', - action='store_false', - default=True, - help='Trust remote code for loading hf models') - @staticmethod def cache_block_seq_len(parser): """Add argument cache_block_seq_len to parser.""" diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 46e569a372..52a71ae569 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -115,7 +115,6 @@ class TurbomindEngineConfig: """TurboMind Engine config. Args: - model_name (str): the name of the deployed model, deprecated and has no effect when version > 0.2.1 model_format (str): the layout of the deployed model. It can be one of the following values [hf, meta_llama, awq], `hf` meaning huggingface model(.bin, .safetensors), `meta_llama` being meta llama's format(.pth), awq` meaning the quantized model by AWQ. tp (int): the number of GPU cards used in tensor parallelism, default to 1 @@ -136,7 +135,6 @@ class TurbomindEngineConfig: max_prefill_iters(int): the max number of forward pass during prefill stage """ # noqa: E501 - model_name: Optional[str] = None model_format: Optional[str] = None tp: int = 1 session_len: Optional[int] = None @@ -169,7 +167,6 @@ class PytorchEngineConfig: """PyTorch Engine Config. Args: - model_name (str): name of the given model. tp (int): Tensor Parallelism. default 1. session_len (int): Max session length. Default None. max_batch_size (int): Max batch size. Default 128. @@ -197,7 +194,6 @@ class PytorchEngineConfig: It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 
""" - model_name: str = '' tp: int = 1 session_len: int = None max_batch_size: int = 128 diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py index 3b3f85b8d5..a9b1390aa2 100644 --- a/lmdeploy/pytorch/chat.py +++ b/lmdeploy/pytorch/chat.py @@ -6,16 +6,17 @@ from lmdeploy.archs import get_model_arch from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig -from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model +from lmdeploy.model import ChatTemplateConfig +from lmdeploy.serve.async_engine import get_names_from_model from lmdeploy.tokenizer import DetokenizeState, Tokenizer from lmdeploy.utils import _get_and_verify_max_len os.environ['TM_LOG_LEVEL'] = 'ERROR' -def input_prompt(model_name): +def input_prompt(chat_template_name): """Input a prompt in the consolo interface.""" - if model_name == 'codellama': + if chat_template_name == 'codellama': print('\nenter !! to end the input >>>\n', end='') sentinel = '!!' else: @@ -81,25 +82,19 @@ def run_chat(model_path: str, nth_round = 1 step = 0 seed = random.getrandbits(64) - model_name = engine_config.model_name - if model_name is None: - model_name = best_match_model(model_path) - assert model_name is not None, 'Can not find match model template' - print(f'match template: <{model_name}>') - - if chat_template_config is not None: - if chat_template_config.model_name is None: - chat_template_config.model_name = model_name - model = chat_template_config.chat_template - else: - model = MODELS.get(model_name)() + + _, chat_template_name = get_names_from_model(model_path) + if chat_template_config is None: + chat_template_config = ChatTemplateConfig(chat_template_name) + model = chat_template_config.chat_template + stop_words = _stop_words(model.stop_words, tokenizer) _, model_config = get_model_arch(model_path) session_len = _get_and_verify_max_len(model_config, None) while True: - prompt = input_prompt(model_name) + prompt = input_prompt(chat_template_name) if prompt == 'exit': exit(0) elif prompt == 'end': @@ -138,7 +133,6 @@ def run_chat(model_path: str, def main(model_path: str, - model_name: str = None, session_id: int = 1, top_k: float = 40, top_p: float = 0.8, @@ -153,7 +147,6 @@ def main(model_path: str, Args: model_path (str): the huggingface model path - model_name (str): name of the model. session_id (int): the identical id of a session top_k (int): sampling top k. top_p (int): sampling top p. 
@@ -168,9 +161,7 @@ def main(model_path: str, adapters = None if adapter is not None: adapters = dict(default=adapter) - engine_config = PytorchEngineConfig(model_name=model_name, - tp=tp, - adapters=adapters) + engine_config = PytorchEngineConfig(tp=tp, adapters=adapters) gen_config = EngineGenerationConfig(max_new_tokens=512, top_k=top_k, top_p=top_p, diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 8bda7b477f..31e6f9fefa 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -107,11 +107,9 @@ def __init__(self, check_adapters(list(engine_config.adapters.values())) self.engine_config = engine_config - model_name = engine_config.model_name tp = engine_config.tp self.tp = tp - self.model_name = model_name self.device_context = DeviceContext( device_type=engine_config.device_type) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 0dea3d17c1..039e136e43 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -21,43 +21,22 @@ logger = get_logger('lmdeploy') -def get_model_name_from_workspace_model(model_dir: str): - """Get model name from workspace model.""" +def get_names_from_model(model_path: str, model_name: str = None): + """Get model name and chat template name from workspace model.""" from configparser import ConfigParser - triton_model_path = os.path.join(model_dir, 'triton_models', 'weights') + triton_model_path = os.path.join(model_path, 'triton_models', 'weights') if not os.path.exists(triton_model_path): - return None - ini_path = os.path.join(triton_model_path, 'config.ini') - # load cfg - with open(ini_path, 'r') as f: - parser = ConfigParser() - parser.read_file(f) - return parser['llama']['model_name'] - - -def deduce_a_name( - model_path: str, - model_name: Optional[str] = None, - backend_config: Optional[Union[TurbomindEngineConfig, - PytorchEngineConfig]] = None, - chat_template_config: Optional[ChatTemplateConfig] = None) -> str: - """Deduce a model name from all the possible arguments.""" - - def _config_model_name(config): - if config and config.model_name: - return config.model_name - return None - - backend_config_model_name = _config_model_name(backend_config) - chat_template_config_model_name = _config_model_name(chat_template_config) - model_name = model_name or backend_config_model_name or chat_template_config_model_name # noqa - if model_name is None: - # model maybe from workspace for turbomind - model_name = get_model_name_from_workspace_model(model_path) - # may get a model name from model_path - if model_name is None: - model_name = model_path - return model_name + chat_template_name = best_match_model(model_path) + else: + # `model_path` refers to a turbomind model, reading + # chat_template_name from the config + ini_path = os.path.join(triton_model_path, 'config.ini') + with open(ini_path, 'r') as f: + parser = ConfigParser() + parser.read_file(f) + chat_template_name = parser['llama']['chat_template'] + model_name = model_name if model_name else model_path + return model_name, chat_template_name @dataclasses.dataclass @@ -147,7 +126,6 @@ class AsyncEngine(LogitsMixin): config instance. Default to none. chat_template_config (ChatTemplateConfig): chat template configuration. Default to None. 
- tp (int): tensor parallel """ def __init__(self, @@ -157,39 +135,25 @@ def __init__(self, backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None, chat_template_config: Optional[ChatTemplateConfig] = None, - tp: int = 1, **kwargs) -> None: logger.info( f'input backend={backend}, backend_config={backend_config}') logger.info(f'input chat_template_config={chat_template_config}') - self.model_name = deduce_a_name(model_path, model_name, backend_config, - chat_template_config) - # build chat template config - if self.model_name in MODELS.module_dict.keys(): - chat_template_name = self.model_name - else: - chat_template_name = best_match_model(model_path) + self.model_name, chat_template_name = get_names_from_model( + model_path, model_name) if chat_template_config is None: chat_template_config = ChatTemplateConfig(chat_template_name) elif chat_template_config.model_name is None: chat_template_config.model_name = chat_template_name self.chat_template = chat_template_config.chat_template - # prevent bc - for k in list(kwargs.keys()): - if hasattr(chat_template_config, k): - logger.warning(f'{k} was deprecated. Please use ' - 'chat_template_config instead') - v = kwargs.pop(k) - setattr(chat_template_config, k, v) logger.info(f'updated chat_template_onfig={chat_template_config}') # build backend engine if backend == 'turbomind': self._build_turbomind(model_path=model_path, backend_config=backend_config, - tp=tp, **kwargs) elif backend == 'pytorch': self._build_pytorch(model_path=model_path, @@ -222,12 +186,10 @@ def _build_turbomind( model_path: str, backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None, - tp: int = 1, **kwargs): """Innter build method for turbomind backend.""" if backend_config is None: - backend_config = TurbomindEngineConfig(model_name=self.model_name, - tp=tp) + backend_config = TurbomindEngineConfig() assert isinstance(backend_config, TurbomindEngineConfig), 'Please'\ ' use TurbomindEngineConfig imported from lmdeploy.messages for ' \ 'turbomind backend' @@ -246,7 +208,7 @@ def _build_pytorch( """Innter build method for pytorch backend.""" from lmdeploy.pytorch.engine import Engine if backend_config is None: - backend_config = PytorchEngineConfig(self.model_name) + backend_config = PytorchEngineConfig() assert isinstance(backend_config, PytorchEngineConfig), 'Please '\ 'use PytorchEngineConfig imported from lmdeploy.messages for ' \ 'pytorch backend' @@ -258,12 +220,6 @@ def _build_pytorch( def __call__(self, prompts: Union[List[str], str, List[Dict], List[List[Dict]]], gen_config: Optional[GenerationConfig] = None, - request_output_len=512, - top_k: int = 40, - top_p: float = 0.8, - temperature: float = 0.8, - repetition_penalty: float = 1.0, - ignore_eos: bool = False, do_preprocess: bool = True, adapter_name: Optional[str] = None, use_tqdm: bool = False, @@ -276,18 +232,6 @@ def __call__(self, a chat history in OpenAI format or a list of chat history. gen_config (GenerationConfig | None): a instance of GenerationConfig. Default to None. - chat_template_config (ChatTemplateConfig | None):a instance of - ChatTemplateConfig. Default to None. - request_output_len (int): output token nums - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. 
- temperature (float): to modulate the next token probability - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - ignore_eos (bool): indicator for ignoring eos do_preprocess (bool): whether pre-process the messages. Default to True, which means chat_template will be applied. adapter_name (str): the adapter name of slora for pytorch backend. @@ -295,13 +239,7 @@ def __call__(self, use_tqdm (bool): Whether use the progress bar. Default to False """ if gen_config is None: - gen_config = GenerationConfig( - max_new_tokens=request_output_len, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - ignore_eos=ignore_eos) + gen_config = GenerationConfig() return self.batch_infer(prompts, gen_config=gen_config, do_preprocess=do_preprocess, diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 08f4de1640..1988b92c26 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -1210,7 +1210,6 @@ def serve(model_path: str, chat_template_config: Optional[ChatTemplateConfig] = None, server_name: str = '0.0.0.0', server_port: int = 23333, - tp: int = 1, allow_origins: List[str] = ['*'], allow_credentials: bool = True, allow_methods: List[str] = ['*'], @@ -1237,8 +1236,9 @@ def serve(model_path: str, on huggingface.co, such as "internlm/internlm-chat-7b", "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. - model_name (str): needed when model_path is a pytorch model on - huggingface.co, such as "InternLM/internlm-chat-7b" + model_name (str): the name of the served model. It can be accessed + by the RESTful API `/v1/models`. If it is not specified, + `model_path` will be adopted backend (str): either `turbomind` or `pytorch` backend. Default to `turbomind` backend. 
backend_config (TurbomindEngineConfig | PytorchEngineConfig): beckend @@ -1280,7 +1280,7 @@ def serve(model_path: str, ssl_certfile = os.environ['SSL_CERTFILE'] http_or_https = 'https' - pipeline_type, pipeline_class = get_task(model_path) + _, pipeline_class = get_task(model_path) VariableInterface.async_engine = pipeline_class( model_path=model_path, @@ -1288,7 +1288,6 @@ def serve(model_path: str, backend=backend, backend_config=backend_config, chat_template_config=chat_template_config, - tp=tp, **kwargs) if qos_config_path: diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index 3800cd87d0..2cb6b2c019 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -4,8 +4,8 @@ from lmdeploy.archs import get_model_arch from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig -from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model -from lmdeploy.serve.async_engine import deduce_a_name +from lmdeploy.model import ChatTemplateConfig +from lmdeploy.serve.async_engine import get_names_from_model from lmdeploy.tokenizer import DetokenizeState from lmdeploy.utils import _get_and_verify_max_len, _stop_words @@ -29,7 +29,6 @@ def input_prompt(model_name): def main(model_path: str, - model_name: str = None, session_id: int = 1, top_k: float = 40, top_p: float = 0.8, @@ -37,7 +36,6 @@ def main(model_path: str, repetition_penalty: float = 1.0, cap: str = 'chat', tp: int = 1, - max_batch_size: int = 1, model_format: str = None, quant_policy: int = 0, cache_max_entry_count: float = 0.8, @@ -54,7 +52,6 @@ def main(model_path: str, Args: model_path (str): the path of the deployed model - model_name (str): the name of deployed model session_id (int): the identical id of a session top_k (int): sampling top k. top_p (int): sampling top p. @@ -62,7 +59,6 @@ def main(model_path: str, repetition_penalty (float): parameter to penalize repetition cap (str): the capability of a model. For example, codellama has the ability among ['completion', 'infilling', 'chat', 'python'] tp (int): GPU number used in tensor parallelism - max_batch_size (int): max batch size model_format (str): the layout of the deployed model. It can be one of the following values [hf, llama, awq] quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4 cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache. 
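Since `model_name` in `serve()` now only determines what the RESTful `/v1/models` endpoint reports, it can be checked from a client with the same `openai` snippet used elsewhere in these docs. A sketch, assuming an api_server is already running locally on the default port:

```python
from openai import OpenAI

# Assumes a server was started beforehand, for example:
#   lmdeploy serve api_server internlm/internlm2-chat-7b --model-name my-model
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')

# `/v1/models` returns the served model name, falling back to the model path
# when `--model-name` is not given.
print([model.id for model in client.models.list().data])
```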
@@ -77,12 +73,7 @@ def main(model_path: str, """ # noqa: E 501 # chat template - model_name = deduce_a_name(model_path, model_name, None, - chat_template_config) - if model_name in MODELS.module_dict.keys(): - chat_template_name = model_name - else: - chat_template_name = best_match_model(model_path) + _, chat_template_name = get_names_from_model(model_path) if chat_template_config is None: chat_template_config = ChatTemplateConfig(chat_template_name) elif chat_template_config.model_name is None: @@ -93,12 +84,11 @@ def main(model_path: str, model = chat_template_config.chat_template _, model_config = get_model_arch(model_path) - session_len = _get_and_verify_max_len(model_config, None) + session_len = _get_and_verify_max_len(model_config, session_len) # engine engine_cfg = TurbomindEngineConfig( - max_batch_size=max_batch_size, - model_name=model_name, + max_batch_size=1, model_format=model_format, session_len=session_len, cache_max_entry_count=cache_max_entry_count, @@ -131,7 +121,7 @@ def main(model_path: str, step = 0 seed = random.getrandbits(64) while True: - prompt = input_prompt(model_name) + prompt = input_prompt(chat_template_name) if prompt == 'exit': exit(0) elif prompt == 'end': diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 6c93d39c42..59a038da97 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -7,8 +7,8 @@ import torch from lmdeploy.archs import get_model_arch -from lmdeploy.model import MODELS -from lmdeploy.utils import get_model +from lmdeploy.model import MODELS, best_match_model +from lmdeploy.utils import get_logger, get_model from ...utils import _get_and_verify_max_len from ..supported_models import SUPPORTED_ARCHS, is_supported @@ -16,6 +16,7 @@ from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', None] +logger = get_logger('lmdeploy') def get_input_model_registered_name(model_path: str, model_format: str): @@ -162,13 +163,45 @@ def pack_model_repository(workspace_path: str): dst=osp.join(model_repo_dir, 'postprocessing')) +def get_tm_model(model_path, + model_name, + chat_template_name, + model_format, + group_size, + tp, + out_dir: str = None): + # TODO: open the following condition check in another PR, + # CLI needs to be updated + # if model_format == 'awq' and group_size <= 0: + # raise RuntimeError( + # 'group_size should be specified when the model is awq') + + input_model_name = get_input_model_registered_name(model_path, + model_format) + input_model = INPUT_MODELS.get(input_model_name)(model_path=model_path, + tokenizer_path=model_path) + + output_model_name, cfg = get_output_model_registered_name_and_config( + model_path=model_path, + model_format=model_format, + group_size=group_size) + + cfg.chat_template = chat_template_name + cfg.tensor_para_size = tp + + output_model = OUTPUT_MODELS.get(output_model_name)( + input_model=input_model, cfg=cfg, out_dir=out_dir) + + return output_model + + def main(model_name: str, model_path: str, model_format: str = None, + chat_template: str = None, tokenizer_path: str = None, dst_path: str = 'workspace', tp: int = 1, - quant_path: str = None, group_size: int = 0, revision: str = None, download_dir: str = None, @@ -176,14 +209,14 @@ def main(model_name: str, """deploy llama family models via turbomind. 
     Args:
-        model_name (str): the name of the to-be-deployed model, such as
-            llama-7b, llama-13b, vicuna-7b and etc
+        model_name (str): deprecated and no longer used
         model_path (str): the directory path of the model
         model_format (str): the format of the model, should choose from
             ['meta_llama', 'hf', 'awq', None]. 'meta_llama' stands for META's
             llama format, 'hf' means huggingface llama format, and 'awq' means
             llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
-            the default value is None
+            The default value is None
+        chat_template (str): the name of the built-in chat template.
         tokenizer_path (str): the path of tokenizer model
         dst_path (str): the destination path that saves outputs
         tp (int): the number of GPUs used for tensor parallelism, should be 2^n
@@ -197,11 +230,17 @@ def main(model_name: str,
             default to the default cache directory of huggingface.
         kwargs (dict): other params for convert
     """
-
-    assert model_name in MODELS.module_dict.keys(), \
-        f"'{model_name}' is not supported. " \
-        f'The supported models are: {MODELS.module_dict.keys()}'
-
+    if model_name:
+        logger.warning(
+            'The argument `model_name` is deprecated and unused now. '
+            'It will be removed on 2024.12.31. It was originally used to '
+            'specify the name of the built-in chat template, but now it '
+            'is substituted with a clearer parameter `--chat-template`')
+    if chat_template is None:
+        chat_template = best_match_model(model_path)
+    assert chat_template in MODELS.module_dict.keys(), \
+        f"chat template '{chat_template}' is not a built-in template. " \
+        f'The built-ins are: {MODELS.module_dict.keys()}'
     assert is_supported(model_path), (
         f'turbomind does not support {model_path}. '
         'Plz try pytorch engine instead.')
@@ -221,39 +260,12 @@ def main(model_name: str,
     )
     print(f'load model from {model_path}')
 
-    input_model_name = get_input_model_registered_name(model_path,
-                                                       model_format)
-    print(f'input_model_registered_name : {input_model_name}')
-    register_names = list(INPUT_MODELS.module_dict.keys())
-    if input_model_name not in register_names:
-        print(
-            f'Failed to find the entry in INPUT_MODELS registry with name'
-            f'"{input_model_name}". 
The registered names are {register_names}') - exit(-1) - - output_model_name, cfg = get_output_model_registered_name_and_config( - model_path, model_format, group_size) - print(f'output_model_registered_name: {output_model_name}') - register_names = list(OUTPUT_MODELS.module_dict.keys()) - if output_model_name not in register_names: - exit(-1) - - cfg.model_name = model_name - cfg.tensor_para_size = tp - tm_weight_path, tm_tokenizer_path = create_workspace(dst_path) - copy_tokenizer(model_path, tokenizer_path, tm_tokenizer_path) - input_model = INPUT_MODELS.get(input_model_name)( - model_path=model_path, - tokenizer_path=tokenizer_path, - ckpt_path=quant_path) - output_model = OUTPUT_MODELS.get(output_model_name)( - input_model=input_model, cfg=cfg, to_file=True, out_dir=tm_weight_path) - print(f'turbomind model config: {output_model.cfg}') - - output_model.export() + tm_model = get_tm_model(model_path, model_name, chat_template, + model_format, group_size, tp, tm_weight_path) + tm_model.export() if __name__ == '__main__': diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 7b1ce3cc8e..ef1473bbe6 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -36,6 +36,7 @@ class TurbomindModelConfig: """Config for turbomind model.""" model_name: str = '' + chat_template: str = '' model_arch: str = None tensor_para_size: int = None head_num: int = None @@ -163,7 +164,6 @@ class BaseOutputModel(ABC): def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, - to_file: bool = True, out_dir: str = ''): super().__init__() self.input_model = input_model @@ -171,8 +171,8 @@ def __init__(self, if not cfg.valid: self.cfg = self.get_config(cfg) assert self.cfg.valid - self.to_file = to_file self.out_dir = out_dir + self.to_file = True if out_dir else False self.tm_params = {} model_info = self.input_model.model_info() self.permute_qk = model_info.get('permute_qk', True) diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py index 1aba38af8d..981a443c33 100644 --- a/lmdeploy/turbomind/deploy/target_model/fp.py +++ b/lmdeploy/turbomind/deploy/target_model/fp.py @@ -21,9 +21,8 @@ class TurbomindModel(BaseOutputModel): def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, - to_file: bool = True, out_dir: str = ''): - super().__init__(input_model, cfg, to_file, out_dir) + super().__init__(input_model, cfg, out_dir) def get_config(self, cfg: TurbomindModelConfig): """Get turbomind config.""" diff --git a/lmdeploy/turbomind/deploy/target_model/plora.py b/lmdeploy/turbomind/deploy/target_model/plora.py index 5d6f9314f3..38edbb9a29 100644 --- a/lmdeploy/turbomind/deploy/target_model/plora.py +++ b/lmdeploy/turbomind/deploy/target_model/plora.py @@ -21,9 +21,8 @@ class TurbomindPloraModel(BaseOutputModel): def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, - to_file: bool = True, out_dir: str = ''): - super().__init__(input_model, cfg, to_file, out_dir) + super().__init__(input_model, cfg, out_dir) def get_config(self, cfg: TurbomindModelConfig): """Get turbomind config.""" diff --git a/lmdeploy/turbomind/deploy/target_model/plora_w4.py b/lmdeploy/turbomind/deploy/target_model/plora_w4.py index 55f8bf48d0..0adf437e28 100644 --- a/lmdeploy/turbomind/deploy/target_model/plora_w4.py +++ b/lmdeploy/turbomind/deploy/target_model/plora_w4.py @@ -12,9 +12,8 @@ class 
TurbomindPloraW4Model(TurbomindPloraModel): def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, - to_file: bool = True, out_dir: str = ''): - super().__init__(input_model, cfg, to_file, out_dir) + super().__init__(input_model, cfg, out_dir) def get_config(self, cfg: TurbomindModelConfig): """Get turbomind config.""" diff --git a/lmdeploy/turbomind/deploy/target_model/w4.py b/lmdeploy/turbomind/deploy/target_model/w4.py index 5cb944a2c8..555ca69398 100644 --- a/lmdeploy/turbomind/deploy/target_model/w4.py +++ b/lmdeploy/turbomind/deploy/target_model/w4.py @@ -75,9 +75,8 @@ class TurbomindW4Model(BaseOutputModel): def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, - to_file: bool = True, out_dir: str = ''): - super().__init__(input_model, cfg, to_file, out_dir) + super().__init__(input_model, cfg, out_dir) def get_config(self, cfg: TurbomindModelConfig): """Get turbomind config.""" diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 4ae170cdac..718cd49d10 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor from configparser import ConfigParser from queue import LifoQueue, Queue -from typing import Dict, Iterable, List, Optional, Union +from typing import Dict, Iterable, List, Union import numpy as np import torch @@ -14,15 +14,12 @@ import lmdeploy from lmdeploy.messages import (EngineGenerationConfig, EngineOutput, ResponseType, TurbomindEngineConfig) -from lmdeploy.model import best_match_model from lmdeploy.tokenizer import Tokenizer -from lmdeploy.utils import get_hf_config_content, get_logger, get_model +from lmdeploy.utils import get_logger, get_model -from .deploy.converter import (SUPPORTED_FORMATS, - get_input_model_registered_name, - get_output_model_registered_name_and_config) -from .deploy.source_model.base import INPUT_MODELS -from .deploy.target_model.base import OUTPUT_MODELS, TurbomindModelConfig +from ..archs import get_model_arch +from .deploy.converter import SUPPORTED_FORMATS, get_tm_model +from .deploy.target_model.base import TurbomindModelConfig from .supported_models import is_supported from .utils import ModelSource, get_model_source @@ -64,28 +61,12 @@ def _tm_dict_to_torch_dict(tm_dict: _tm.TensorMap): return ret -def _update_engine_config(config: TurbomindEngineConfig, **kwargs): - if config is None: - config = TurbomindEngineConfig() - for k, v in kwargs.items(): - if v and hasattr(config, k): - setattr(config, k, v) - logger.warning(f'kwargs {k} is deprecated to initialize model, ' - 'use TurbomindEngineConfig instead.') - if config.model_name is not None: - logger.warning('model_name is deprecated in TurbomindEngineConfig ' - 'and has no effect') - return config - - class TurboMind: """LMDeploy's inference engine. 
Args: model_path (str): the path of turbomind's model model_source (int): model source - model_name (str): needed when model_path is a hf model and not - managed by lmdeploy model_format (str): needed when model_path is a hf model and not managed by lmdeploy group_size (int): needed when model_path is a hf model and not @@ -95,38 +76,15 @@ class TurboMind: def __init__(self, model_path: str, + model_name: str = None, + chat_template_name: str = None, engine_config: TurbomindEngineConfig = None, model_source: ModelSource = ModelSource.WORKSPACE, - model_name: Optional[str] = None, - model_format: Optional[str] = None, - group_size: Optional[int] = None, - tp: Optional[int] = None, **kwargs): - # if loading from workspace and engine_config is None, use config.ini - # and ignore passed args like model_format, tp, etc. - if model_source == ModelSource.WORKSPACE and engine_config is None: - - def _catch_args(**kwargs): - args = [] - for k, v in kwargs.items(): - if v and hasattr(TurbomindEngineConfig, k): - args.append(k) - return args - - args = _catch_args(**kwargs, model_format=model_format, tp=tp) - if len(args) > 0: - logger.warning( - f'loading from workspace, ignore args {args} ' - 'please use TurbomindEngineConfig or modify config.ini') - - else: - engine_config = _update_engine_config(engine_config, - model_format=model_format, - group_size=group_size, - tp=tp, - **kwargs) + self.model_name = model_name + self.chat_template_name = chat_template_name - tp = engine_config.tp if engine_config is not None else 1 + tp = 1 if engine_config is None else engine_config.tp assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n' self.gpu_count = tp @@ -197,17 +155,16 @@ def _from_hf(self, model_source: ModelSource, model_path: str, """Load model which is in hf format.""" assert model_source == ModelSource.HF_MODEL, \ f'{model_source} is not supported' + if engine_config is None: + logger.warning('input engine config is None, using the default') + engine_config = TurbomindEngineConfig() assert engine_config.model_format in SUPPORTED_FORMATS, \ f'The model format should be in {SUPPORTED_FORMATS}' - # update model_format if not supplied and outputs_stats.pth exists - if osp.exists(osp.join(model_path, 'outputs_stats.pth')) and \ - engine_config.model_format is None: - engine_config.model_format = 'awq' - + group_size = 0 if engine_config.model_format is None: - cfg = get_hf_config_content(model_path) - quant_config = cfg.get('quantization_config') + _, cfg = get_model_arch(model_path) + quant_config = getattr(cfg, 'quantization_config', None) if quant_config: quant_method = quant_config.get('quant_method') group_size = int(quant_config.get('group_size', 0)) @@ -221,24 +178,12 @@ def _from_hf(self, model_source: ModelSource, model_path: str, 'Plz try pytorch engine instead.') # convert transformers model into turbomind model format - match_name = best_match_model(model_path) - input_model_name = get_input_model_registered_name( - model_path, engine_config.model_format) - input_model = INPUT_MODELS.get(input_model_name)( - model_path=model_path, tokenizer_path=model_path, ckpt_path=None) - - output_model_name, cfg = get_output_model_registered_name_and_config( - model_path=model_path, - model_format=engine_config.model_format, - group_size=0) - cfg.update_from_engine_config(engine_config) - output_model = OUTPUT_MODELS.get(output_model_name)( - input_model=input_model, cfg=cfg, to_file=False, out_dir='') - - self.config = output_model.cfg - self.config.model_name = match_name \ - if match_name 
is not None else 'base' - self.model_name = self.config.model_name + tm_model = get_tm_model(model_path, self.model_name, + self.chat_template_name, + engine_config.model_format, group_size, + engine_config.tp) + + self.config = tm_model.cfg logger.info(f'model_config:\n\n{self.config.toini()}') model_comm = _tm.AbstractTransformerModel.create_llama_model( @@ -251,10 +196,10 @@ def _from_hf(self, model_source: ModelSource, model_path: str, self._create_weight(model_comm) # copy hf model weight to turbomind weight - tm_params = output_model.tm_params + tm_params = tm_model.tm_params self._get_model_params(model_comm, tm_params) logger.warning(f'get {len(tm_params)} model params') - output_model.export() + tm_model.export() # there should be no left turbomind params. if len(tm_params) > 0: uninitialized = list(tm_params.keys()) @@ -287,10 +232,12 @@ def _from_workspace(self, model_path: str, if engine_config is not None: engine_config.tp = cfg.tensor_para_size cfg.update_from_engine_config(engine_config) - - # update cls + if self.model_name: + cfg.model_name = self.model_name + if self.chat_template_name: + cfg.chat_template_name = self.chat_template_name + # update cfg self.config = cfg - self.model_name = cfg.model_name # create model logger.warning(f'model_config:\n\n{cfg.toini()}') @@ -308,10 +255,9 @@ def _from_workspace(self, model_path: str, @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, + model_name: str = None, + chat_template_name: str = None, engine_config: TurbomindEngineConfig = None, - model_format: Optional[str] = None, - group_size: Optional[int] = None, - tp: Optional[int] = None, **kwargs): """LMDeploy's turbomind inference engine. @@ -329,20 +275,16 @@ def from_pretrained(cls, on huggingface.co, such as "internlm/internlm-chat-7b", "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on. - model_format (str): model format - group_size (int): group size - tp (int): tensor parallel size kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update configuration when initialize the engine. 
""" model_source = get_model_source(pretrained_model_name_or_path) logger.info(f'model_source: {model_source}') return cls(model_path=pretrained_model_name_or_path, + model_name=model_name, + chat_template_name=chat_template_name, engine_config=engine_config, model_source=model_source, - model_format=model_format, - group_size=group_size, - tp=tp, **kwargs) def create_instance(self, cuda_stream_id=0): @@ -432,24 +374,6 @@ def _func(device_id, enque_output): f = self.executor.submit(_func, device_id, device_id == 0) self.futures[device_id] = f - def _update_generation_config(self, config: EngineGenerationConfig, - **kwargs: dict): - if config is None: - config = EngineGenerationConfig() - - deprecated_kwargs = [] - for k, v in kwargs.items(): - if k in config.__dict__: - config.__dict__[k] = v - deprecated_kwargs.append(k) - if 'request_output_len' in kwargs: - config.max_new_tokens = kwargs['request_output_len'] - deprecated_kwargs.append('request_output_len') - for k in deprecated_kwargs: - logger.warning(f'kwargs {k} is deprecated for inference, ' - 'use GenerationConfig instead.') - return config - def _get_logprobs(self, logprob_vals: torch.Tensor, logprob_indexes: torch.Tensor, @@ -487,11 +411,12 @@ def end(self, session_id: int): """End the given session.""" input_ids = [self.tm_model.tokenizer.eos_token_id] end_generator = self.tm_model.create_instance() - for outputs in end_generator.stream_infer(session_id, - input_ids, - request_output_len=0, - sequence_start=False, - sequence_end=True): + for outputs in end_generator.stream_infer( + session_id, + input_ids, + sequence_start=False, + sequence_end=True, + gen_config=EngineGenerationConfig(max_new_tokens=0)): pass async def async_end(self, session_id: int): @@ -502,12 +427,13 @@ def cancel(self, session_id: int): """Stop current streaming inference.""" input_ids = [self.tm_model.tokenizer.eos_token_id] stop_generator = self.tm_model.create_instance() - for outputs in stop_generator.stream_infer(session_id, - input_ids, - request_output_len=0, - sequence_start=False, - sequence_end=False, - stop=True): + for outputs in stop_generator.stream_infer( + session_id, + input_ids, + sequence_start=False, + sequence_end=False, + stop=True, + gen_config=EngineGenerationConfig(max_new_tokens=0)): pass async def async_cancel(self, session_id: int): @@ -694,7 +620,6 @@ async def async_stream_infer(self, logger.info(f'Register stream callback for {session_id}') self.model_insts[0].register_callback(_forward_callback) - gen_config = self._update_generation_config(gen_config, **kwargs) inputs, input_lengths = self.prepare_inputs( session_id=session_id, input_ids=input_ids, @@ -808,7 +733,6 @@ def stream_infer(self, logger.info(f'Register stream callback for {session_id}') self.model_insts[0].register_callback(self._forward_callback) - gen_config = self._update_generation_config(gen_config, **kwargs) inputs, input_lengths = self.prepare_inputs( session_id=session_id, input_ids=input_ids, diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index 856f8efbe4..de1bf04efb 100644 --- a/lmdeploy/utils.py +++ b/lmdeploy/utils.py @@ -1,16 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import asyncio import functools -import json import logging -import os import sys import time from contextlib import contextmanager from logging import Logger, LogRecord from typing import List, Optional, TypeVar, Union -from huggingface_hub import hf_hub_download from transformers import PretrainedConfig logger_initialized = {} @@ -183,20 +180,6 @@ def _stop_words(stop_words: List[Union[int, str]], tokenizer: object): return stop_words -def get_hf_config_content(pretrained_model_name_or_path: str, - **kwargs) -> dict: - """Get config content of a hf model.""" - if os.path.exists(pretrained_model_name_or_path): - config_path = os.path.join(pretrained_model_name_or_path, - 'config.json') - else: - config_path = hf_hub_download(pretrained_model_name_or_path, - 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - return config - - def get_model(pretrained_model_name_or_path: str, download_dir: str = None, revision: str = None, diff --git a/lmdeploy/vl/templates.py b/lmdeploy/vl/templates.py index 8c0bed8ac5..34f367a247 100644 --- a/lmdeploy/vl/templates.py +++ b/lmdeploy/vl/templates.py @@ -5,8 +5,9 @@ import PIL import PIL.Image +from lmdeploy.archs import get_model_arch from lmdeploy.model import BaseModel -from lmdeploy.utils import get_hf_config_content, get_logger +from lmdeploy.utils import get_logger from lmdeploy.vl.constants import IMAGE_TOKEN from lmdeploy.vl.utils import encode_image_base64, load_image @@ -310,8 +311,7 @@ def get_vl_prompt_template(model_path: str, chat_template: BaseModel, if model_name == 'yi-vl': return YiVLChatTemplateWrapper(chat_template) - config = get_hf_config_content(model_path) - arch = config['architectures'][0] + arch, _ = get_model_arch(model_path) if arch == 'QWenLMHeadModel': return QwenVLChatTemplateWrapper(chat_template) elif arch in [ diff --git a/tests/test_lmdeploy/test_async_engine.py b/tests/test_lmdeploy/test_async_engine.py index e8ec119fad..872b6b1abc 100644 --- a/tests/test_lmdeploy/test_async_engine.py +++ b/tests/test_lmdeploy/test_async_engine.py @@ -1,26 +1,35 @@ -import pytest - -from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig -from lmdeploy.model import ChatTemplateConfig -from lmdeploy.serve.async_engine import deduce_a_name - - -@pytest.mark.parametrize( - 'backend_config', - [TurbomindEngineConfig('internlm'), - PytorchEngineConfig(None), None]) -@pytest.mark.parametrize( - 'chat_template_config', - [ChatTemplateConfig('internlm'), - ChatTemplateConfig(None), None]) -@pytest.mark.parametrize('model_name', ['internlm', None]) -@pytest.mark.parametrize('model_path', ['internlm/internlm2-chat-7b']) -def test_deduce_a_name(model_path, model_name, chat_template_config, - backend_config): - name = deduce_a_name(model_path, model_name, chat_template_config, - backend_config) - if model_name or getattr(backend_config, 'model_name', None) or getattr( - chat_template_config, 'model_name', None): - assert name == 'internlm' - else: - assert name == model_path +import configparser +import os +import tempfile + +from lmdeploy.serve.async_engine import get_names_from_model + + +def test_get_names_from_hf_model(): + cases = [ + # model repo_id from huggingface hub, model_name, chat_template_name + ('InternLM/internlm2_5-7b-chat', 'internlm2.5-7b-chat', 'internlm2'), + ('InternLM/internlm2_5-7b-chat', None, 'internlm2'), + ] + for model_path, model_name, chat_template in cases: + _model_name, _chat_template = get_names_from_model( + model_path=model_path, model_name=model_name) + assert 
_chat_template == chat_template + assert _model_name == model_name if model_name else model_path + + +def test_get_names_from_turbomind_model(): + workspace = tempfile.TemporaryDirectory('internlm2_5-7b-chat').name + os.makedirs(os.path.join(workspace, 'triton_models', 'weights'), + exist_ok=True) + + expected_chat_template = 'internlm2' + config = configparser.ConfigParser() + config.add_section('llama') + config.set('llama', 'chat_template', expected_chat_template) + + with open(f'{workspace}/triton_models/weights/config.ini', 'w') as f: + config.write(f) + + _, chat_template = get_names_from_model(workspace) + assert chat_template == expected_chat_template From 687f24264ed80095cf6edd3fbfd887193c0525f1 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:48:25 +0800 Subject: [PATCH 06/39] Add user guide about slora serving (#2084) * Add slora serving document, add nest_asyncio to requirements * recover requirements * titles --- docs/en/index.rst | 1 + docs/en/llm/api_server_lora.md | 96 +++++++++++++++++++++++++++++++ docs/zh_cn/index.rst | 1 + docs/zh_cn/llm/api_server_lora.md | 96 +++++++++++++++++++++++++++++++ 4 files changed, 194 insertions(+) create mode 100644 docs/en/llm/api_server_lora.md create mode 100644 docs/zh_cn/llm/api_server_lora.md diff --git a/docs/en/index.rst b/docs/en/index.rst index 3842b54f08..5f5f3420dc 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -59,6 +59,7 @@ Documentation llm/pipeline.md llm/api_server.md llm/api_server_tools.md + llm/api_server_lora.md llm/gradio.md llm/proxy_server.md diff --git a/docs/en/llm/api_server_lora.md b/docs/en/llm/api_server_lora.md new file mode 100644 index 0000000000..fa685f4990 --- /dev/null +++ b/docs/en/llm/api_server_lora.md @@ -0,0 +1,96 @@ +# Serving LoRA + +## Launch S-LoRA + +S-LoRA is currently only supported by the PyTorch backend. Its deployment process is similar to that of other models, and you can view the commands using lmdeploy `serve api_server -h`. Among the parameters supported by the PyTorch backend, there are configuration options for S-LoRA. + +```txt +PyTorch engine arguments: + --adapters [ADAPTERS [ADAPTERS ...]] + Used to set path(s) of lora adapter(s). One can input key-value pairs in xxx=yyy format for multiple lora adapters. If only have one adapter, one can only input the path of the adapter.. Default: + None. Type: str +``` + +The user only needs to pass the Hugging Face model path of the LoRA weights in the form of a dictionary to `--adapters`. + +```shell +lmdeploy serve api_server THUDM/chatglm2-6b --adapters mylora=chenchi/lora-chatglm2-6b-guodegang +``` + +After the service starts, you can find two available model names in the Swagger UI: ‘THUDM/chatglm2-6b’ and ‘mylora’. The latter is the key in the `--adapters` dictionary. + +## Client usage + +### CLI + +When using the OpenAI endpoint, the `model` parameter can be used to select either the base model or a specific LoRA weight for inference. The following example chooses to use the provided `chenchi/lora-chatglm2-6b-guodegang` for inference. 
+ +```shell +curl -X 'POST' \ + 'http://localhost:23334/v1/chat/completions' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "mylora", + "messages": [ + { + "content": "hi", + "role": "user" + } + ] +}' +``` + +And here is the output: + +```json +{ + "id": "2", + "object": "chat.completion", + "created": 1721377275, + "model": "mylora", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": " 很高兴哪有什么赶凳儿?(按东北语说的“起早哇”),哦,东北人都学会外语了?", + "tool_calls": null + }, + "logprobs": null, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 17, + "total_tokens": 43, + "completion_tokens": 26 + } +} +``` + +### python + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = 'mylora' +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "user", "content": "hi"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` + +The printed response content is: + +```txt +ChatCompletion(id='4', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=' 很高兴能够见到你哪,我也在辐射区开了个愣儿,你呢,还活着。', role='assistant', function_call=None, tool_calls=None))], created=1721377497, model='mylora', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=22, prompt_tokens=17, total_tokens=39)) +``` diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 8691c423b0..262f970ce0 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -59,6 +59,7 @@ LMDeploy 工具箱提供以下核心功能: llm/pipeline.md llm/api_server.md llm/api_server_tools.md + llm/api_server_lora.md llm/gradio.md llm/proxy_server.md diff --git a/docs/zh_cn/llm/api_server_lora.md b/docs/zh_cn/llm/api_server_lora.md new file mode 100644 index 0000000000..b6f40d941f --- /dev/null +++ b/docs/zh_cn/llm/api_server_lora.md @@ -0,0 +1,96 @@ +# LoRA 推理服务 + +## 启动 S-LoRA 服务 + +S-LoRA 目前只有 pytorch 后端支持。它的服务化,和其他模型服务化一样,命令都可以用 `lmdeploy serve api_server -h` 查看。其中 pytorch 后端支持的参数就有 S-LoRA 的配置内容。 + +```txt +PyTorch engine arguments: + --adapters [ADAPTERS [ADAPTERS ...]] + Used to set path(s) of lora adapter(s). One can input key-value pairs in xxx=yyy format for multiple lora adapters. If only have one adapter, one can only input the path of the adapter.. Default: + None. 
Type: str +``` + +用户只需要将 lora 权重的 huggingface 模型路径通过字典的形式传入 `--adapters` 即可。 + +```shell +lmdeploy serve api_server THUDM/chatglm2-6b --adapters mylora=chenchi/lora-chatglm2-6b-guodegang +``` + +服务启动后,可以在 Swagger UI 中查询到两个可用的模型名字:“THUDM/chatglm2-6b” 和 “mylora”。后者是 `--adapters` 字典的 key。 + +## 客户端使用 + +### CLI + +使用时,OpenAI 接口参数 `model` 可以用来选择使用基础模型还是某个 lora 权重用于推理。下面的例子就选择使用了传入的 `chenchi/lora-chatglm2-6b-guodegang` 用于推理。 + +```shell +curl -X 'POST' \ + 'http://localhost:23334/v1/chat/completions' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "mylora", + "messages": [ + { + "content": "hi", + "role": "user" + } + ] +}' +``` + +可以得到一个这个 lora 权重特有的回复: + +```json +{ + "id": "2", + "object": "chat.completion", + "created": 1721377275, + "model": "mylora", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": " 很高兴哪有什么赶凳儿?(按东北语说的“起早哇”),哦,东北人都学会外语了?", + "tool_calls": null + }, + "logprobs": null, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 17, + "total_tokens": 43, + "completion_tokens": 26 + } +} +``` + +### python + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = 'mylora' +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "user", "content": "hi"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` + +打印的响应内容为: + +```txt +ChatCompletion(id='4', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=' 很高兴能够见到你哪,我也在辐射区开了个愣儿,你呢,还活着。', role='assistant', function_call=None, tool_calls=None))], created=1721377497, model='mylora', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=22, prompt_tokens=17, total_tokens=39)) +``` From c685f777b397eb3d9fa3b2c0ac6dba2198586961 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Thu, 8 Aug 2024 14:51:46 +0800 Subject: [PATCH 07/39] Fix duplicated session_id when pipeline is used by multithreads (#2134) * add session_ids arg for multithread use of pipeline.stream_infer * Revert "disable peer access code (#2082)" This reverts commit 263e8cfbced7d8261a1f66223ade9427af795eba. * Revert "Revert "disable peer access code (#2082)"" This reverts commit 2b74d4623d7ddcc13af8fc2a231098c50cfc42cd. * update * add peer allocator * fix lint * check cuda error * fix comments * fix wrong allocator --------- Co-authored-by: Li Zhang --- lmdeploy/messages.py | 6 ++++-- lmdeploy/serve/async_engine.py | 23 +++++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 52a71ae569..a8d22bad10 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -249,8 +249,7 @@ class Response: generate_token_len (int): the response token length. input_token_len (int): the input prompt token length. Note that it may contains chat template part. - session_id (int): the id for running the session. Basically, it refers - to the position index of the input request batch. + session_id (int): the id for running the session. finish_reason ('stop' | 'length' | None): the reason the model stopped generating tokens. This will be 'stop' if the model hit a natural stop point or a provided stop sequence, 'length' if the maximum @@ -258,6 +257,8 @@ class Response: token_ids: (List[int]): the output token ids. logprobs: (List[Dict[int, float]]): the top logprobs for each output position. 
+ index (int): it refers to the position index of the input request + batch """ text: str generate_token_len: int @@ -266,6 +267,7 @@ class Response: finish_reason: Optional[Literal['stop', 'length']] = None token_ids: List[int] = field(default_factory=list) logprobs: List[Dict[int, float]] = None + index: int = 0 @dataclass diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 039e136e43..f9c7d969dc 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -180,6 +180,7 @@ def __init__(self, self.gens_set = set() for i in range(self.instance_num): self.gens_set.add(self.engine.create_instance()) + self._session_id = count(0) def _build_turbomind( self, @@ -328,7 +329,11 @@ def batch_infer( assert len(prompts) == len(gen_config),\ 'input gen_confg length differs from the length of prompts' # noqa prompt_num = len(prompts) - outputs = [Response('', 0, 0, i) for i in range(prompt_num)] + session_ids = [next(self._session_id) for _ in range(prompt_num)] + outputs = [ + Response('', 0, 0, session_ids[i], index=i) + for i in range(prompt_num) + ] generators = [] if use_tqdm: import tqdm @@ -336,7 +341,7 @@ def batch_infer( for i, prompt in enumerate(prompts): generators.append( self.generate(prompt, - i, + session_ids[i], gen_config=gen_config[i], stream_response=True, sequence_start=True, @@ -404,12 +409,13 @@ def stream_infer( gen_config = [gen_config] * len(prompts) assert len(prompts) == len(gen_config),\ 'input gen_confg length differs from the length of prompts' # noqa + session_ids = [next(self._session_id) for _ in range(len(prompts))] outputs = Queue() generators = [] for i, prompt in enumerate(prompts): generators.append( self.generate(prompt, - i, + session_ids[i], gen_config=gen_config[i], stream_response=True, sequence_start=True, @@ -421,9 +427,14 @@ def stream_infer( async def _inner_call(i, generator): async for out in generator: outputs.put( - Response(out.response, out.generate_token_len, - out.input_token_len, i, out.finish_reason, - out.token_ids, out.logprobs)) + Response(out.response, + out.generate_token_len, + out.input_token_len, + session_ids[i], + out.finish_reason, + out.token_ids, + out.logprobs, + index=i)) async def gather(): await asyncio.gather( From 02077a7d03b6bdaf905ba32e8bdd755d41d77401 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Fri, 9 Aug 2024 14:48:30 +0800 Subject: [PATCH 08/39] fix gradio autobackend (#2256) --- lmdeploy/serve/gradio/app.py | 1 - lmdeploy/serve/gradio/vl.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py index a3cd00351e..7812358eff 100644 --- a/lmdeploy/serve/gradio/app.py +++ b/lmdeploy/serve/gradio/app.py @@ -48,7 +48,6 @@ def run(model_path_or_server: str, pipeline_type, _ = get_task(model_path_or_server) if pipeline_type == 'vlm': from lmdeploy.serve.gradio.vl import run_local - assert backend == 'turbomind', 'vlm only support turbomind backend' if backend_config is not None and \ backend_config.session_len is None: backend_config.session_len = 8192 diff --git a/lmdeploy/serve/gradio/vl.py b/lmdeploy/serve/gradio/vl.py index 64bf8c6160..ebeb371492 100644 --- a/lmdeploy/serve/gradio/vl.py +++ b/lmdeploy/serve/gradio/vl.py @@ -174,7 +174,10 @@ def cancel(chatbot, session): def reset(session): """Reset a new session.""" - stop(session) + if session is None: + session = Session() + else: + stop(session) session._step = 0 session._message = [] return [], session, enable_btn From 
c9aaa5b63ee8e650243e684fef9b0d36f142f3cc Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Mon, 12 Aug 2024 15:37:26 +0800 Subject: [PATCH 09/39] remove eviction param (#2285) --- lmdeploy/messages.py | 5 ----- lmdeploy/pytorch/engine/engine.py | 1 - 2 files changed, 6 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index a8d22bad10..c3e37f0f3f 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -174,8 +174,6 @@ class PytorchEngineConfig: by the k/v cache. For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8, signifying the percentage of FREE GPU memory to be reserved for the k/v cache - eviction_type (str): What action to perform when kv cache - is full, ['recompute', 'copy'], Deprecated. prefill_interval (int): Interval to perform prefill, Default 16. block_size (int): paging cache block size, default 64. @@ -198,7 +196,6 @@ class PytorchEngineConfig: session_len: int = None max_batch_size: int = 128 cache_max_entry_count: float = 0.8 - eviction_type: str = 'recompute' prefill_interval: int = 16 block_size: int = 64 num_cpu_blocks: int = 0 @@ -216,8 +213,6 @@ def __post_init__(self): assert self.tp >= 1, 'invalid tp' assert self.max_batch_size >= 1, 'invalid max_batch_size' assert self.cache_max_entry_count > 0 and self.cache_max_entry_count < 1, 'invalid cache_max_entry_count' # noqa - assert self.eviction_type in ('recompute', - 'copy'), 'invalid eviction_type' assert self.num_cpu_blocks >= 0, 'invalid num_cpu_blocks' assert self.max_prefill_token_num >= 0, 'invalid max_prefill_token_num' assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks' diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 31e6f9fefa..5758ea08c8 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -117,7 +117,6 @@ def __init__(self, scheduler_config = SchedulerConfig( max_batches=engine_config.max_batch_size, max_session_len=engine_config.session_len, - eviction_type=engine_config.eviction_type, prefill_interval=engine_config.prefill_interval) # block_size = 1 to enable unified paging From 7933956ffd1c4304eb08ecb0e8cdc23c3170b226 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Mon, 12 Aug 2024 15:38:46 +0800 Subject: [PATCH 10/39] support vlm custom image process parameters in openai input format (#2245) * support vlm custom parameters in openai input format * remove flash_attn deps * update * update * update --- .../pytorch/configurations/phi3_vision.py | 1 + lmdeploy/serve/vl_async_engine.py | 27 ++++++----- lmdeploy/vl/engine.py | 48 ++++++++++++++----- lmdeploy/vl/model/base.py | 5 +- lmdeploy/vl/model/internvl.py | 30 ++++++++---- lmdeploy/vl/templates.py | 31 +++++++----- 6 files changed, 97 insertions(+), 45 deletions(-) diff --git a/lmdeploy/pytorch/configurations/phi3_vision.py b/lmdeploy/pytorch/configurations/phi3_vision.py index 4da05d3ed4..5f633ca579 100644 --- a/lmdeploy/pytorch/configurations/phi3_vision.py +++ b/lmdeploy/pytorch/configurations/phi3_vision.py @@ -15,4 +15,5 @@ def build(cls, hf_config, model_path: str = None): """build.""" cfg = DefaultModelConfigBuilder.build(hf_config) cfg.unused_modules = ['model.vision_embed_tokens'] + cfg.init_kwargs = dict(_attn_implementation=None) return cfg diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index c67034d3de..09c05780fb 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -61,15 +61,20 @@ async def _get_prompt_input(self, results = {} input_ids = [] if len(segs) 
> 1: - images = await self.vl_prompt_template.async_collect_pil_images( - prompt) - features = await self.vl_encoder.async_infer(images) - - from lmdeploy.vl.templates import MiniCPMVTempateWrapper - if isinstance(self.vl_prompt_template, MiniCPMVTempateWrapper): - decorated, features = self.vl_prompt_template.update_image_token( # noqa: E501 - decorated, features) - segs = decorated.split(IMAGE_TOKEN) + # yapf: disable + images_with_kwargs = await self.vl_prompt_template.async_collect_pil_images(prompt) # noqa: E501 + # yapf: enable + features = [] + if len(images_with_kwargs) > 0: + images, image_kwargs = list(zip(*images_with_kwargs)) + features = await self.vl_encoder.async_infer( + images, image_kwargs) + + from lmdeploy.vl.templates import MiniCPMVTempateWrapper + if isinstance(self.vl_prompt_template, MiniCPMVTempateWrapper): + decorated, features = self.vl_prompt_template.update_image_token( # noqa: E501 + decorated, features) + segs = decorated.split(IMAGE_TOKEN) features = [x.cpu().numpy() for x in features] input_ids = [] @@ -91,8 +96,8 @@ async def _get_prompt_input(self, and sequence_start)) input_ids.extend(seg_ids) ranges = np.stack([begins, ends], axis=1).tolist() - results['input_embeddings'] = features - results['input_embedding_ranges'] = ranges + results['input_embeddings'] = features or None + results['input_embedding_ranges'] = ranges or None else: input_ids = self.tokenizer.encode(decorated, add_bos=sequence_start) diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py index 4cc729f8a0..4cf5cb83d5 100644 --- a/lmdeploy/vl/engine.py +++ b/lmdeploy/vl/engine.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio +import inspect import queue import time from threading import Thread -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import torch from PIL.Image import Image @@ -33,15 +34,17 @@ def __init__(self, thread_safe): self.thread_safe = thread_safe self.number = [] self.waiting = [] + self.kwargs = [] self.done = [] self.res_que = [] self.total = 0 - def enqueue(self, images: List[Image], que: Union[queue.Queue, - asyncio.Queue]): + def enqueue(self, images: List[Image], kwargs: List[Dict], + que: Union[queue.Queue, asyncio.Queue]): """add ith request to manager.""" self.number.append(len(images)) self.waiting.extend(images) + self.kwargs.extend(kwargs) self.res_que.append(que) self.total += len(images) self.log('received', len(images)) @@ -49,10 +52,12 @@ def enqueue(self, images: List[Image], que: Union[queue.Queue, def dequeue(self, max_batch_size): """try to dequeue max batch size images.""" inputs = self.waiting[:max_batch_size] + kwargs = self.kwargs[:max_batch_size] self.waiting = self.waiting[max_batch_size:] + self.kwargs = self.kwargs[max_batch_size:] self.total -= len(inputs) self.log('process', len(inputs)) - return inputs + return inputs, kwargs def notify(self): """set result if request i is finished.""" @@ -135,20 +140,33 @@ async def _forward_loop(self): while self._que.qsize() == 0: await asyncio.sleep(0) item = await self._que.get() - record.enqueue(item[0], item[1]) - inputs = record.dequeue(self.max_batch_size) + record.enqueue(item[0], item[1], item[2]) + inputs, kwargs = record.dequeue(self.max_batch_size) future = asyncio.get_event_loop().run_in_executor( - None, self.forward, inputs) + None, self.forward, inputs, kwargs) future.add_done_callback(_raise_exception_on_finish) outputs = await future record.done.extend(outputs) while record.notify(): pass - def forward(self, 
inputs: List[Image]): + def _init_input_params(self, + inputs: List[Image], + params: List[Dict] = None): + """Check and init inputs params.""" + if params is None: + params = [{}] * len(inputs) + assert len(params) == len(inputs), \ + 'different length of inputs and kwargs' + return params + + def forward(self, inputs: List[Image], params: List[Dict] = None): """Model forward.""" + params = self._init_input_params(inputs, params) time_start = time.perf_counter() - outputs = self.model.forward(inputs) + func_params = inspect.signature(self.model.forward).parameters + func_inputs = [inputs, params] if len(func_params) > 1 else [inputs] + outputs = self.model.forward(*func_inputs) if isinstance(outputs[0], torch.Tensor): outputs = [x.cpu() for x in outputs] time_end = time.perf_counter() @@ -156,15 +174,19 @@ def forward(self, inputs: List[Image]): f'cost {time_end - time_start:.3f}s') return outputs - def infer(self, inputs: List[Image]): + def infer(self, inputs: List[Image], params: List[Dict] = None): """infer.""" - results = self.forward(inputs) + params = self._init_input_params(inputs, params) + results = self.forward(inputs, params) return results - async def async_infer(self, inputs: List[Image]): + async def async_infer(self, + inputs: List[Image], + params: List[Dict] = None): """async infer.""" + params = self._init_input_params(inputs, params) outputs = asyncio.Queue() - item = (inputs, outputs) + item = (inputs, params, outputs) if self.vision_config.thread_safe: self._loop.call_soon_threadsafe(self._que.put_nowait, item) else: diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py index b3a3081934..9c5f5f6e6a 100644 --- a/lmdeploy/vl/model/base.py +++ b/lmdeploy/vl/model/base.py @@ -36,11 +36,14 @@ def build_model(): raise NotImplementedError() @abstractmethod - def forward(self, images: List[PIL.Image.Image]) -> List[torch.Tensor]: + def forward(self, + images: List[PIL.Image.Image], + image_kwargs: List[Dict] = None) -> List[torch.Tensor]: """extract image feature. Args: images (List[PIL.Image.Image]): input images + image_kwargs (List[Dict]): input kwargs for each images Return: List[torch.Tensor]: extract image feature for each input image diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index cda51b3f64..9f8bffe3e6 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import List +from typing import Dict, List import torch from PIL.Image import Image @@ -130,13 +130,23 @@ def build_model(self): self.model_path) self._forward_func = self._forward - def _preprocess_v1_5(self, images: List[Image]): + def _preprocess_v1_5(self, images: List[Image], params: List[Dict] = None): + if params is not None: + assert len(images) == len( + params), 'different length of images and params' + else: + params = [{}] * len(images) + + image_res = {'low': 6, 'medium': 12, 'high': 24} + outputs = [] - for image in images: + for image, param in zip(images, params): + res_key = param.get('detail', 'default') + max_num = image_res.get(res_key, self.config.max_dynamic_patch) out = dynamic_preprocess( image, min_num=self.config.min_dynamic_patch, - max_num=self.config.max_dynamic_patch, + max_num=max_num, image_size=self.config.vision_config.image_size, use_thumbnail=self.config.use_thumbnail) out = [self.transform(x) for x in out] @@ -144,9 +154,9 @@ def _preprocess_v1_5(self, images: List[Image]): outputs.append(out) return outputs - def _forward_v1_5(self, images: List[Image]): + def _forward_v1_5(self, images: List[Image], params: List[Dict] = None): """forward for internvl-chat-v1-5.""" - outputs = self._preprocess_v1_5(images) + outputs = self._preprocess_v1_5(images, params) split = [x.shape[0] for x in outputs] outputs = torch.cat(outputs, dim=0) outputs = outputs.to(self.model.device, dtype=torch.float16) @@ -155,7 +165,7 @@ def _forward_v1_5(self, images: List[Image]): outputs = [x.reshape(-1, x.shape[-1]) for x in outputs] return outputs - def _forward(self, images: List[Image]): + def _forward(self, images: List[Image], params: List[Dict] = None): """forward for internvl-chat-v1-1, internvl-chat-v1-2.""" pixel_values = self.image_processor(images=images, return_tensors='pt').pixel_values @@ -166,7 +176,9 @@ def _forward(self, images: List[Image]): return outputs @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: + def forward(self, + images: List[Image], + params: List[Dict] = None) -> List[torch.Tensor]: """forward.""" images = [x.convert('RGB') for x in images] - return self._forward_func(images) + return self._forward_func(images, params) diff --git a/lmdeploy/vl/templates.py b/lmdeploy/vl/templates.py index 34f367a247..fe7f01edfe 100644 --- a/lmdeploy/vl/templates.py +++ b/lmdeploy/vl/templates.py @@ -70,9 +70,9 @@ def prompt_to_messages(self, prompt: VLPromptType): return [messages] async def async_collect_pil_images( - self, messages: Dict) -> List[PIL.Image.Image]: + self, messages: Dict) -> List[Tuple[PIL.Image.Image, Dict]]: """collect image from messages.""" - images = [] + images_with_kwargs = [] for message in messages: role = message['role'] content = message['content'] @@ -82,22 +82,31 @@ async def async_collect_pil_images( # 'image_url': means url or local path to image. # 'image_data': means PIL.Image.Image object. 
if item['type'] == 'image_url': - url = item['image_url']['url'] - images.append(url) + item_copy = item['image_url'].copy() + try: + url = item_copy.pop('url') + images_with_kwargs.append([url, item_copy]) + except KeyError: + logger.error(f'invalid format {message}') elif item['type'] == 'image_data': - data = item['image_data']['data'] - images.append(data) + item_copy = item['image_data'].copy() + try: + data = item_copy.pop('data') + images_with_kwargs.append([data, item_copy]) + except KeyError: + logger.error(f'invalid format {message}') def _inner_call(i, images): - url_or_data = images[i] - images[i] = load_image(url_or_data) + url_or_data = images[i][0] + images[i][0] = load_image(url_or_data) await asyncio.gather(*[ - asyncio.get_event_loop().run_in_executor( - None, _inner_call, i, images) for i in range(len(images)) + asyncio.get_event_loop().run_in_executor(None, _inner_call, i, + images_with_kwargs) + for i in range(len(images_with_kwargs)) ]) - return images + return images_with_kwargs def append_image_token(self, prompt, num_images: int): """append image token to user prompt.""" From f8f8543df3db34f0f12ee7f9f085542694f00fdd Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 12 Aug 2024 16:38:06 +0800 Subject: [PATCH 11/39] fix side-effect: failed to update tm model config with tm engine config (#2275) * fix side-effect: failed to update tm model config with tm engine config * fix --- lmdeploy/turbomind/deploy/converter.py | 18 ++++++++++-------- lmdeploy/turbomind/turbomind.py | 5 ++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 59a038da97..28012a7263 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -7,6 +7,7 @@ import torch from lmdeploy.archs import get_model_arch +from lmdeploy.messages import TurbomindEngineConfig from lmdeploy.model import MODELS, best_match_model from lmdeploy.utils import get_logger, get_model @@ -166,9 +167,8 @@ def pack_model_repository(workspace_path: str): def get_tm_model(model_path, model_name, chat_template_name, - model_format, group_size, - tp, + engine_config, out_dir: str = None): # TODO: open the following condition check in another PR, # CLI needs to be updated @@ -176,18 +176,19 @@ def get_tm_model(model_path, # raise RuntimeError( # 'group_size should be specified when the model is awq') - input_model_name = get_input_model_registered_name(model_path, - model_format) + input_model_name = get_input_model_registered_name( + model_path, engine_config.model_format) input_model = INPUT_MODELS.get(input_model_name)(model_path=model_path, tokenizer_path=model_path) output_model_name, cfg = get_output_model_registered_name_and_config( model_path=model_path, - model_format=model_format, + model_format=engine_config.model_format, group_size=group_size) cfg.chat_template = chat_template_name - cfg.tensor_para_size = tp + cfg.model_name = model_name + cfg.update_from_engine_config(engine_config) output_model = OUTPUT_MODELS.get(output_model_name)( input_model=input_model, cfg=cfg, out_dir=out_dir) @@ -263,8 +264,9 @@ def main(model_name: str, tm_weight_path, tm_tokenizer_path = create_workspace(dst_path) copy_tokenizer(model_path, tokenizer_path, tm_tokenizer_path) - tm_model = get_tm_model(model_path, model_name, chat_template, - model_format, group_size, tp, tm_weight_path) + engine_config = TurbomindEngineConfig(tp=tp, model_format=model_format) + tm_model = get_tm_model(model_path, 
model_name, chat_template, group_size, + engine_config, tm_weight_path) tm_model.export() diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 718cd49d10..5fd9824a03 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -179,9 +179,8 @@ def _from_hf(self, model_source: ModelSource, model_path: str, # convert transformers model into turbomind model format tm_model = get_tm_model(model_path, self.model_name, - self.chat_template_name, - engine_config.model_format, group_size, - engine_config.tp) + self.chat_template_name, group_size, + engine_config) self.config = tm_model.cfg logger.info(f'model_config:\n\n{self.config.toini()}') From 85daad96ec4832e47678066ff07f6d6cc69697d0 Mon Sep 17 00:00:00 2001 From: wuhongsheng <664116298@qq.com> Date: Tue, 13 Aug 2024 16:15:28 +0800 Subject: [PATCH 12/39] Update python support version (#2290) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index cfa15e76ae..32a69c600c 100644 --- a/setup.py +++ b/setup.py @@ -157,6 +157,7 @@ def gen_packages_items(): 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Intended Audience :: Developers', 'Intended Audience :: Education', 'Intended Audience :: Science/Research', From ebae7d28d9dcf2b62dab926770d97089764e69da Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:42:28 +0800 Subject: [PATCH 13/39] Remove QoS serving (#2294) --- lmdeploy/cli/serve.py | 7 +- lmdeploy/serve/openai/api_server.py | 393 +----------------- lmdeploy/serve/openai/protocol.py | 64 --- lmdeploy/serve/qos_engine/__init__.py | 1 - lmdeploy/serve/qos_engine/inner_group_schd.py | 73 ---- .../serve/qos_engine/qos_config.json.template | 58 --- lmdeploy/serve/qos_engine/qos_engine.py | 256 ------------ lmdeploy/serve/qos_engine/usage_stats.py | 136 ------ 8 files changed, 8 insertions(+), 980 deletions(-) delete mode 100644 lmdeploy/serve/qos_engine/__init__.py delete mode 100644 lmdeploy/serve/qos_engine/inner_group_schd.py delete mode 100644 lmdeploy/serve/qos_engine/qos_config.json.template delete mode 100644 lmdeploy/serve/qos_engine/qos_engine.py delete mode 100644 lmdeploy/serve/qos_engine/usage_stats.py diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 33ca0c36e4..85a1a13dad 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -125,10 +125,6 @@ def add_parser_api_server(): type=str, default=['*'], help='A list of allowed http headers for cors') - parser.add_argument('--qos-config-path', - type=str, - default='', - help='Qos policy config path') # common args ArgumentHelper.backend(parser) ArgumentHelper.log_level(parser) @@ -288,8 +284,7 @@ def api_server(args): allow_headers=args.allow_headers, log_level=args.log_level.upper(), api_keys=args.api_keys, - ssl=args.ssl, - qos_config_path=args.qos_config_path) + ssl=args.ssl) @staticmethod def api_client(args): diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 1988b92c26..301ff76763 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -18,17 +18,15 @@ from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.async_engine import AsyncEngine from lmdeploy.serve.openai.protocol import ( # noqa: E501 - ChatCompletionRequest, ChatCompletionRequestQos, ChatCompletionResponse, + ChatCompletionRequest, 
ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatCompletionTokenLogprob, ChatMessage, - ChoiceLogprobs, CompletionRequest, CompletionRequestQos, - CompletionResponse, CompletionResponseChoice, - CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage, - EmbeddingsRequest, EncodeRequest, EncodeResponse, ErrorResponse, - FunctionResponse, GenerateRequest, GenerateRequestQos, GenerateResponse, - LogProbs, ModelCard, ModelList, ModelPermission, ToolCall, TopLogprob, - UsageInfo) -from lmdeploy.serve.qos_engine.qos_engine import QosEngine + ChoiceLogprobs, CompletionRequest, CompletionResponse, + CompletionResponseChoice, CompletionResponseStreamChoice, + CompletionStreamResponse, DeltaMessage, EmbeddingsRequest, EncodeRequest, + EncodeResponse, ErrorResponse, FunctionResponse, GenerateRequest, + GenerateResponse, LogProbs, ModelCard, ModelList, ModelPermission, + ToolCall, TopLogprob, UsageInfo) from lmdeploy.tokenizer import DetokenizeState, Tokenizer from lmdeploy.utils import get_logger @@ -40,7 +38,6 @@ class VariableInterface: async_engine: AsyncEngine = None session_id: int = 0 api_keys: Optional[List[str]] = None - qos_engine: QosEngine = None request_hosts = [] @@ -239,135 +236,6 @@ async def health() -> Response: return Response(status_code=200) -@app.post('/v1/chat/completions_qos') -async def chat_completions_v1_qos(request: ChatCompletionRequestQos, - raw_request: Request = None): - """Completion API similar to OpenAI's API. - - Refer to `https://platform.openai.com/docs/api-reference/chat/create` - for the API specification. - - The request should be a JSON object with the following fields: - - model: model name. Available from /v1/models. - - messages: string prompt or chat history in OpenAI format. - - temperature (float): to modulate the next token probability - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - n (int): How many chat completion choices to generate for each input - message. **Only support one here**. - - stream: whether to stream the results or not. Default to false. - - max_tokens (int): output token nums - - repetition_penalty (float): The parameter for repetition penalty. 
- 1.0 means no penalty - - Additional arguments supported by LMDeploy: - - ignore_eos (bool): indicator for ignoring eos - - user_id (str): for qos; if not specified, will set to "default" - - Currently we do not support the following features: - - function_call (Users should implement this by themselves) - - logit_bias (not supported yet) - - presence_penalty (replaced with repetition_penalty) - - frequency_penalty (replaced with repetition_penalty) - """ - VariableInterface.session_id += 1 - request.session_id = VariableInterface.session_id - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - - model_name = request.model - request_id = str(request.session_id) - created_time = int(time.time()) - - if VariableInterface.qos_engine is None: - return create_error_response( - HTTPStatus.NOT_FOUND, - 'cannot parse qos engine config, this api is not work') - - result_generator = await VariableInterface.qos_engine.generate_with_qos( - request) - - if result_generator is None: - return create_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, - 'Failed to generate completions') - - def create_stream_response_json( - index: int, - text: str, - finish_reason: Optional[str] = None, - ) -> str: - choice_data = ChatCompletionResponseStreamChoice( - index=index, - delta=DeltaMessage(role='assistant', content=text), - finish_reason=finish_reason, - ) - response = ChatCompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[choice_data], - ) - response_json = response.model_dump_json() - - return response_json - - async def completion_stream_generator() -> AsyncGenerator[str, None]: - async for res in result_generator: - response_json = create_stream_response_json( - index=0, - text=res.response, - ) - yield f'data: {response_json}\n\n' - yield 'data: [DONE]\n\n' - - # Streaming response - if request.stream: - return StreamingResponse(completion_stream_generator(), - media_type='text/event-stream') - - # Non-streaming response - final_res = None - text = '' - async for res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. - await VariableInterface.async_engine.stop_session( - request.session_id) - return create_error_response(HTTPStatus.BAD_REQUEST, - 'Client disconnected') - final_res = res - text += res.response - assert final_res is not None - choices = [] - choice_data = ChatCompletionResponseChoice( - index=0, - message=ChatMessage(role='assistant', content=text), - finish_reason=final_res.finish_reason, - ) - choices.append(choice_data) - - total_tokens = sum([ - final_res.history_token_len, final_res.input_token_len, - final_res.generate_token_len - ]) - usage = UsageInfo( - prompt_tokens=final_res.input_token_len, - completion_tokens=final_res.generate_token_len, - total_tokens=total_tokens, - ) - response = ChatCompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - return response - - @app.post('/v1/chat/completions', dependencies=[Depends(check_api_key)]) async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Request = None): @@ -596,157 +464,6 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: return response -@app.post('/v1/completions_qos') -async def completions_v1_qos(request: CompletionRequestQos, - raw_request: Request = None): - """Completion API similar to OpenAI's API. 
- - Go to `https://platform.openai.com/docs/api-reference/completions/create` - for the API specification. - - The request should be a JSON object with the following fields: - - model (str): model name. Available from /v1/models. - - prompt (str): the input prompt. - - suffix (str): The suffix that comes after a completion of inserted text. - - max_tokens (int): output token nums - - temperature (float): to modulate the next token probability - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - n (int): How many chat completion choices to generate for each input - message. **Only support one here**. - - stream: whether to stream the results or not. Default to false. - - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - - user (str): A unique identifier representing your end-user. - - Additional arguments supported by LMDeploy: - - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - - ignore_eos (bool): indicator for ignoring eos - - user_id (str): for qos; if not specified, will set to "default" - - Currently we do not support the following features: - - logprobs (not supported yet) - - presence_penalty (replaced with repetition_penalty) - - frequency_penalty (replaced with repetition_penalty) - """ - VariableInterface.session_id += 1 - request.session_id = VariableInterface.session_id - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - - model_name = request.model - request_id = str(request.session_id) - created_time = int(time.time()) - if isinstance(request.prompt, str): - request.prompt = [request.prompt] - - if VariableInterface.qos_engine is None: - return create_error_response( - HTTPStatus.NOT_FOUND, - 'cannot parse qos engine config, this api is not work') - - generators = await VariableInterface.qos_engine.generate_with_qos(request) - - def create_stream_response_json( - index: int, - text: str, - finish_reason: Optional[str] = None, - usage: Optional[UsageInfo] = None, - ) -> str: - choice_data = CompletionResponseStreamChoice( - index=index, - text=text, - finish_reason=finish_reason, - ) - response = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[choice_data], - usage=usage, - ) - response_json = response.model_dump_json() - - return response_json - - async def completion_stream_generator() -> AsyncGenerator[str, None]: - # First chunk with role - for generator in generators: - async for res in generator: - usage = None - if res.finish_reason is not None: - final_res = res - total_tokens = sum([ - final_res.history_token_len, final_res.input_token_len, - final_res.generate_token_len - ]) - usage = UsageInfo( - prompt_tokens=final_res.input_token_len, - completion_tokens=final_res.generate_token_len, - total_tokens=total_tokens, - ) - response_json = create_stream_response_json( - index=0, - text=res.response, - usage=usage, - ) - yield f'data: {response_json}\n\n' - yield 'data: [DONE]\n\n' - - # Streaming response - if request.stream: - return StreamingResponse(completion_stream_generator(), - media_type='text/event-stream') - - # Non-streaming response - usage = UsageInfo() - choices = [] - - async def _inner_call(i, generator): - final_res = None - text = '' - async for res in generator: - if await raw_request.is_disconnected(): - # Abort the request if the client 
disconnects. - await VariableInterface.async_engine.stop_session( - request.session_id) - return create_error_response(HTTPStatus.BAD_REQUEST, - 'Client disconnected') - final_res = res - text += res.response - assert final_res is not None - choice_data = CompletionResponseChoice( - index=0, - text=text, - finish_reason=final_res.finish_reason, - ) - choices.append(choice_data) - - total_tokens = sum([ - final_res.history_token_len, final_res.input_token_len, - final_res.generate_token_len - ]) - usage.prompt_tokens += final_res.input_token_len - usage.completion_tokens += final_res.generate_token_len - usage.total_tokens += total_tokens - - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(generators))]) - - response = CompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - return response - - @app.post('/v1/completions', dependencies=[Depends(check_api_key)]) async def completions_v1(request: CompletionRequest, raw_request: Request = None): @@ -990,83 +707,6 @@ def encode(prompt: str, do_preprocess: bool, add_bos: bool): return EncodeResponse(input_ids=encoded, length=length) -@app.post('/v1/chat/interactive_qos') -async def chat_interactive_v1_qos(request: GenerateRequestQos, - raw_request: Request = None): - """Generate completion for the request. - - - On interactive mode, the chat history is kept on the server. Please set - `interactive_mode = True`. - - On normal mode, no chat history is kept on the server. Set - `interactive_mode = False`. - - The request should be a JSON object with the following fields: - - prompt: the prompt to use for the generation. - - session_id: determine which instance will be called. If not specified - with a value other than -1, using random value directly. - - interactive_mode (bool): turn on interactive mode or not. On interactive - mode, session history is kept on the server (and vice versa). - - stream: whether to stream the results or not. - - stop: whether to stop the session response or not. - - request_output_len (int): output token nums - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - - temperature (float): to modulate the next token probability - - repetition_penalty (float): The parameter for repetition penalty. 
- 1.0 means no penalty - - ignore_eos (bool): indicator for ignoring eos - - user_id (str): for qos; if not specified, will set to "default" - """ - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - if request.session_id == -1: - VariableInterface.session_id += 1 - request.session_id = VariableInterface.session_id - - if VariableInterface.qos_engine is None: - return create_error_response( - HTTPStatus.NOT_FOUND, - 'cannot parse qos engine config, this api is not work') - - generation = await VariableInterface.qos_engine.generate_with_qos(request) - - # Streaming case - async def stream_results() -> AsyncGenerator[bytes, None]: - async for out in generation: - chunk = GenerateResponse(text=out.response, - tokens=out.generate_token_len, - input_tokens=out.input_token_len, - history_tokens=out.history_token_len, - finish_reason=out.finish_reason) - data = chunk.model_dump_json() - yield f'{data}\n' - - if request.stream: - return StreamingResponse(stream_results(), - media_type='text/event-stream') - else: - ret = {} - text = '' - tokens = 0 - finish_reason = None - async for out in generation: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. - await VariableInterface.qos_engine.stop_session( - request.session_id) - return create_error_response(HTTPStatus.BAD_REQUEST, - 'Client disconnected') - text += out.response - tokens = out.generate_token_len - finish_reason = out.finish_reason - ret = {'text': text, 'tokens': tokens, 'finish_reason': finish_reason} - return JSONResponse(ret) - - @app.post('/v1/chat/interactive', dependencies=[Depends(check_api_key)]) async def chat_interactive_v1(request: GenerateRequest, raw_request: Request = None): @@ -1217,7 +857,6 @@ def serve(model_path: str, log_level: str = 'ERROR', api_keys: Optional[Union[List[str], str]] = None, ssl: bool = False, - qos_config_path: str = '', **kwargs): """An example to perform model inference through the command line interface. @@ -1256,7 +895,6 @@ def serve(model_path: str, api_keys (List[str] | str | None): Optional list of API keys. Accepts string type as a single api_key. Default to None, which means no api key applied. ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'. 
- qos_config_path (str): qos policy config path """ # noqa E501 if os.getenv('TM_LOG_LEVEL') is None: os.environ['TM_LOG_LEVEL'] = log_level @@ -1290,23 +928,6 @@ def serve(model_path: str, chat_template_config=chat_template_config, **kwargs) - if qos_config_path: - try: - with open(qos_config_path, 'r') as file: - qos_config_str = file.read() - VariableInterface.qos_engine = QosEngine( - qos_tag=qos_config_str, - engine=VariableInterface.async_engine, - **kwargs) - VariableInterface.qos_engine.start() - except FileNotFoundError: - VariableInterface.qos_engine = None - else: - # hide qos functions if not applied - for i in range(len(app.router.routes)): - if 'qos' in app.router.routes[i].path: - app.router.routes[i].include_in_schema = False - for i in range(3): print( f'HINT: Please open \033[93m\033[1m{http_or_https}://' diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 635ad1d519..59a2a3968a 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -55,29 +55,6 @@ class UsageInfo(BaseModel): completion_tokens: Optional[int] = 0 -class ChatCompletionRequestQos(BaseModel): - """Chat completion request.""" - model: str - messages: Union[str, List[Dict[str, str]]] - temperature: Optional[float] = 0.7 - top_p: Optional[float] = 1.0 - logprobs: Optional[bool] = False - top_logprobs: Optional[int] = None - n: Optional[int] = 1 - max_tokens: Optional[int] = Field(default=None, examples=[None]) - stop: Optional[bool] = False - stream: Optional[bool] = False - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - user: Optional[str] = None - user_id: Optional[str] = None - # additional argument of lmdeploy - repetition_penalty: Optional[float] = 1.0 - session_id: Optional[int] = -1 - ignore_eos: Optional[bool] = False - top_k: Optional[int] = 40 - - class Function(BaseModel): """Function descriptions.""" description: Optional[str] = Field(default=None, examples=[None]) @@ -242,31 +219,6 @@ class CompletionRequest(BaseModel): top_k: Optional[int] = 40 # for opencompass -class CompletionRequestQos(BaseModel): - """Completion request.""" - model: str - prompt: Union[str, List[Any]] - suffix: Optional[str] = None - temperature: Optional[float] = 0.7 - n: Optional[int] = 1 - logprobs: Optional[int] = None - max_tokens: Optional[int] = 16 - stop: Optional[Union[str, List[str]]] = None - stream: Optional[bool] = False - top_p: Optional[float] = 1.0 - logprobs: Optional[int] = None - echo: Optional[bool] = False - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - user: Optional[str] = None - # additional argument of lmdeploy - top_k: Optional[int] = 40 - repetition_penalty: Optional[float] = 1.0 - session_id: Optional[int] = -1 - ignore_eos: Optional[bool] = False - user_id: Optional[str] = None - - class CompletionResponseChoice(BaseModel): """Completion response choices.""" index: int @@ -352,22 +304,6 @@ class GenerateRequest(BaseModel): adapter_name: Optional[str] = Field(default=None, examples=[None]) -class GenerateRequestQos(BaseModel): - """Generate request.""" - prompt: Union[str, List[Dict[str, str]]] - session_id: int = -1 - interactive_mode: bool = False - stream: bool = False - stop: bool = False - request_output_len: int = 512 - top_p: float = 0.8 - top_k: int = 40 - temperature: float = 0.8 - repetition_penalty: float = 1.0 - ignore_eos: bool = False - user_id: Optional[str] = None - - class GenerateResponse(BaseModel): """Generate response.""" text: str 
diff --git a/lmdeploy/serve/qos_engine/__init__.py b/lmdeploy/serve/qos_engine/__init__.py deleted file mode 100644 index ef101fec61..0000000000 --- a/lmdeploy/serve/qos_engine/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/serve/qos_engine/inner_group_schd.py b/lmdeploy/serve/qos_engine/inner_group_schd.py deleted file mode 100644 index 30cd4edd12..0000000000 --- a/lmdeploy/serve/qos_engine/inner_group_schd.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import collections - -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - - -class UserRequestQueue: - """Inner group user request queues.""" - - def __init__(self, group: str, user_id_map: dict): - self.group = group - self.user_queue_map = dict() - self.user_quota_map = dict() - self.user_id_maps = user_id_map - - total_quota = 0 - for item in user_id_map: - total_quota += item['quota_pct'] - for item in user_id_map: - user_id = item['id'] - self.user_queue_map[user_id] = collections.deque() - self.user_quota_map[user_id] = item['quota_pct'] / total_quota - - def enqueue(self, request_event): - """Enqueue request to corresponding user queue.""" - if request_event[0].user_id in self.user_queue_map: - self.user_queue_map[request_event[0].user_id].append(request_event) - else: - self.user_queue_map['default'].append(request_event) - - def empty(self): - """Whether all user queues are empty.""" - for _, req_queue in self.user_queue_map.items(): - if len(req_queue) != 0: - return False - return True - - def dequeue(self, usage_stats): - """Dequeue the request to serve.""" - uid_to_serve = self.user_to_serve(usage_stats) - if uid_to_serve in self.user_queue_map: - return self.user_queue_map[uid_to_serve].popleft() - - return None - - def user_to_serve(self, usage_stats): - """Inner group scheduling. - - Find the user to serve from user request queues. 
- """ - min_usage = 100 - uid_to_serve = '' - for uid, req_queue in self.user_queue_map.items(): - if len(req_queue) == 0: - continue - - # TODO: include token length - # Calculate current user's actual used share and quota share - user_usage, _, group_usage, _ = usage_stats.get_user_usage( - uid, self.group) - actual_share = (user_usage / group_usage) if group_usage > 0 else 0 - due_share = self.user_quota_map[uid] - - # Serve the user with the relatively least usage share - curr_usage = (actual_share / due_share) if due_share > 0 else 0 - if curr_usage == 0: - return uid - if curr_usage < min_usage: - uid_to_serve = uid - min_usage = curr_usage - return uid_to_serve diff --git a/lmdeploy/serve/qos_engine/qos_config.json.template b/lmdeploy/serve/qos_engine/qos_config.json.template deleted file mode 100644 index 1120fbdd27..0000000000 --- a/lmdeploy/serve/qos_engine/qos_config.json.template +++ /dev/null @@ -1,58 +0,0 @@ -{ - "enable_user_qos": 1, - "user_groups": ["Platinum", "Gold", "Silver", "Bronze"], - "user_group_map": { - "Platinum": [ - { - "id": "user_id0", - "quota_pct": 100 - }, - { - "id": "default", - "quota_pct": 0 - } - ], - "Gold": [ - { - "id": "user_id1", - "quota_pct": 50 - }, - { - "id": "user_id2", - "quota_pct": 50 - }, - { - "id": "default", - "quota_pct": 0 - } - ], - "Silver": [ - { - "id": "user_id3", - "quota_pct": 5 - }, - { - "id": "default", - "quota_pct": 95 - } - ], - "Bronze": [ - { - "id": "user_id4", - "quota_pct": 30 - }, - { - "id": "user_id5", - "quota_pct": 30 - }, - { - "id": "user_id6", - "quota_pct": 40 - }, - { - "id": "default", - "quota_pct": 0 - } - ] - } -} diff --git a/lmdeploy/serve/qos_engine/qos_engine.py b/lmdeploy/serve/qos_engine/qos_engine.py deleted file mode 100644 index df1bf8e413..0000000000 --- a/lmdeploy/serve/qos_engine/qos_engine.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import asyncio -import json -import threading -import time -from typing import List - -from lmdeploy.serve.openai.protocol import (ChatCompletionRequestQos, - CompletionRequestQos, - GenerateRequestQos) -from lmdeploy.serve.qos_engine.inner_group_schd import UserRequestQueue -from lmdeploy.serve.qos_engine.usage_stats import UsageStats -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - - -class QosConfig: - """qos config class: parse qosconfig for qos engine.""" - - def __init__(self, qos_tag=''): - qos_config = json.loads(qos_tag) - self.is_qos_enabled = qos_config.get('enable_user_qos', False) - logger.debug(f'is_qos_enabled: {self.is_qos_enabled}') - - if self.is_qos_enabled: - self.user_id_maps = qos_config['user_group_map'] - self.user_group_prio = qos_config['user_groups'] - logger.debug(f'user_id_maps: {self.user_id_maps}') - logger.debug(f'user_group_prio: {self.user_group_prio}') - - -class QosEngine: - """impl for qos engine, docs/en/qos.md.""" - - def __init__(self, qos_tag='', engine=None, **kwargs) -> None: - self.engine = engine - self.availSlots = engine.instance_num - self._stop_event = threading.Event() - self._dequeue_thread = threading.Thread(target=self._serve, - daemon=True) - self.qos_config = QosConfig(qos_tag) - - self.qos_user_group = QosGroupQueue(self.qos_config) - - self.usage_stats = UsageStats( - total_duration=60, - buffer_count=6, - start_index=0, - user_groups=self.qos_config.user_group_prio) - self.user_served_reqs = dict() - self._dump_stats_thread = threading.Thread(target=self._dump_stats, - daemon=True) - - self.lock = threading.Lock() - self.stats_lock = threading.Lock() - - def start(self): - """start qos engine.""" - if self.is_qos_enabled(): - self._dequeue_thread.start() - self._dump_stats_thread.start() - - def is_qos_enabled(self): - """check while qos engine is enabled.""" - return self.qos_config.is_qos_enabled - - async def stop_session(self, session_id: int): - """Stop a session by a session_id.""" - await self.engine.stop_session(session_id) - - async def generate(self, request): - """entry of qos engine generate for three api.""" - if isinstance(request, CompletionRequestQos): - if isinstance(request.prompt, str): - request.prompt = [request.prompt] - generators = [] - for i in range(len(request.prompt)): - result_generator = self.engine.generate( - request.prompt[i], - request.session_id + i, - stream_response=True, # always use stream for batching - sequence_start=True, - sequence_end=True, - request_output_len=request.max_tokens - if request.max_tokens else 512, - stop=False, - top_p=request.top_p, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - ignore_eos=request.ignore_eos, - do_preprocess=False) - generators.append(result_generator) - return generators - - elif isinstance(request, GenerateRequestQos): - async_engine = self.engine - sequence_start = async_engine.id2step.get(str(request.session_id), - 0) == 0 - sequence_end = not request.interactive_mode - - generation = async_engine.generate( - request.prompt, - request.session_id, - stream_response=True, # always use stream to enable batching - sequence_start=sequence_start, - sequence_end=sequence_end, - request_output_len=request.request_output_len, - top_p=request.top_p, - top_k=request.top_k, - stop=request.stop, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - ignore_eos=request.ignore_eos) - return generation - - elif isinstance(request, ChatCompletionRequestQos): - # default 
chat/completions - result_generator = self.engine.generate( - request.messages, - request.session_id, - stream_response=True, # always use stream to enable batching - sequence_start=True, - sequence_end=True, - request_output_len=request.max_tokens - if request.max_tokens else 512, - stop=request.stop, - top_p=request.top_p, - top_k=request.top_k, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - ignore_eos=request.ignore_eos) - return result_generator - - return time.sleep(0.01) - - async def generate_with_qos(self, request): - """called by api server for qos generate.""" - if not self.is_qos_enabled(): - return await self.generate(request) - - # push (request,event) to queue - event = asyncio.Event() - request_event = (request, event) - with self.lock: - self.qos_user_group.enqueue(request_event) - - await event.wait() - - result_generator = await self.generate(request) - - # release self.availSlots resources - with self.lock: - if isinstance(request, CompletionRequestQos) and isinstance( - request.prompt, List): - self.availSlots += len(request.prompt) - else: - self.availSlots += 1 - - # Update number of served requests for each user - with self.stats_lock: - if request.user_id not in self.user_served_reqs: - self.user_served_reqs[request.user_id] = 1 - else: - self.user_served_reqs[request.user_id] += 1 - - return result_generator - - def _serve(self): - """backend thread for dequeue.""" - while not self._stop_event.is_set(): - if self.availSlots > 0: - with self.lock: - request_event = self.dequeue(self.usage_stats) - if request_event is not None: - # Update usage_stats - user_group = self.qos_user_group.get_user_group( - request_event[0].user_id) - self.usage_stats.update_usage(request_event[0].user_id, - user_group, 100, - int(time.time())) - if isinstance(request_event[0], - CompletionRequestQos) and isinstance( - request_event[0].prompt, List): - self.availSlots -= len(request_event[0].prompt) - else: - self.availSlots -= 1 - request_event[1].set() - logger.debug( - f'Available slot decrease, now: {self.availSlots}') - time.sleep(0) - - def _dump_stats(self): - """dump usage states for debugs.""" - ts = 0 - while not self._stop_event.is_set(): - outdata = '' - with self.stats_lock: - if not self.user_served_reqs: - outdata = 'none' - else: - sorted_uids = sorted(self.user_served_reqs.keys()) - for uid in sorted_uids: - outdata += f'{uid} {self.user_served_reqs[uid]} reqs, ' - self.user_served_reqs = dict() - logger.info( - f'qos svc running for {ts} seconds,last 20 seconds: {outdata}') - ts += 20 - time.sleep(20) - - def dequeue(self, usage_stats): - """dequeue from multiqueue.""" - return self.qos_user_group.dequeue(usage_stats) - - -class QosGroupQueue: - """create groups for qos outer group schedule.""" - - def __init__(self, qos_config): - if qos_config is None: - self.user_list = {} - self.queues = {} - else: - self.user_list = qos_config.user_id_maps - self.queues = {} - for user_group in qos_config.user_group_prio: - self.queues[user_group] = UserRequestQueue( - user_group, self.user_list[user_group]) - self.user_group_list = list(self.user_list.keys()) - self.default_user_group = self.user_group_list[2] if len( - self.user_group_list) >= 3 else 'None' - logger.debug(self.user_list) - logger.debug(self.queues) - logger.debug(self.default_user_group) - - def get_user_group(self, user_id): - """input: user, output user_id""" - for category, users in self.user_list.items(): - for user in users: - if user_id == user['id']: - return category 
- return self.default_user_group - - def enqueue(self, request_event): - """enqueue outer group waiting for schedule.""" - user_id = self.get_user_group(request_event[0].user_id) - self.queues[user_id].enqueue(request_event) - - def dequeue(self, usage_stats): - """dequeue outer group schedule.""" - for user_group_id, user_group_queue in self.queues.items(): - if user_group_queue.empty(): - continue - else: - return user_group_queue.dequeue(usage_stats) - return None diff --git a/lmdeploy/serve/qos_engine/usage_stats.py b/lmdeploy/serve/qos_engine/usage_stats.py deleted file mode 100644 index 05ed97cc13..0000000000 --- a/lmdeploy/serve/qos_engine/usage_stats.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import threading -from typing import List - - -class Buffer: - """Ring buffer for calculate tokens and reqs usage.""" - - def __init__(self, ts: int, user_groups: List[str]): - self.ts = ts - # Per user usage - self.uid_to_tokens_ps = dict() - self.uid_to_reqs_ps = dict() - - # Per group usage - self.group_to_tokens_ps = dict() - self.group_to_reqs_ps = dict() - - for group in user_groups: - self.group_to_tokens_ps[group] = 0 - self.group_to_reqs_ps[group] = 0 - - -class UsageStats: - """calculate usage for qos engine for inner group schedule.""" - - def __init__(self, total_duration: int, buffer_count: int, - start_index: int, user_groups: List[str]): - self.total_duration = total_duration - self.buffer_count = buffer_count - self.start_index = start_index - self.start_ts = int(0) - - self.buffer_duration = int(total_duration / buffer_count) - self.circular_buffer = [ - Buffer(self.buffer_duration * i, user_groups) - for i in range(buffer_count) - ] - - self.user_groups = user_groups - - self.lock = threading.Lock() - - def update_usage(self, uid: str, group: str, out_token_len: int, - req_ts: int): - """Update UsageStats when a request is returned.""" - with self.lock: - intervals = int((req_ts - self.start_ts) / self.buffer_duration) - - curr_idx = (self.start_index + intervals) % self.buffer_count - curr_ts = self.start_ts + intervals * self.buffer_duration - - # Current request outside the sliding window - if intervals >= self.buffer_count: - reset_buf_cnt = intervals - self.buffer_count - curr_buf_ts = 0 - - if reset_buf_cnt >= self.buffer_count: - # All buffers are reset - for i in range(1, self.buffer_count): - reset_idx = (curr_idx + i) % self.buffer_count - self.circular_buffer[reset_idx] = Buffer( - req_ts + i * self.buffer_duration, - self.user_groups) - # Update self.start_index - self.start_index = curr_idx - self.start_ts = req_ts - curr_buf_ts = req_ts - else: - # buffers between self.start_index and curr_idx are reset - for i in range(reset_buf_cnt): - reset_idx = (self.start_index + i) % self.buffer_count - reset_ts = self.circular_buffer[ - reset_idx].ts + self.total_duration - self.circular_buffer[reset_idx] = Buffer( - reset_ts, self.user_groups) - - # Update self.start_index - self.start_index = (curr_idx + 1) % self.buffer_count - self.start_ts = self.circular_buffer[self.start_index].ts - curr_buf_ts = self.circular_buffer[ - curr_idx].ts + self.total_duration - - # Set corresponding buffer - self.circular_buffer[curr_idx] = Buffer( - curr_buf_ts, self.user_groups) - self.circular_buffer[curr_idx].uid_to_reqs_ps[uid] = 1 - self.circular_buffer[curr_idx].uid_to_tokens_ps[ - uid] = out_token_len - self.circular_buffer[curr_idx].group_to_reqs_ps[group] = 1 - self.circular_buffer[curr_idx].group_to_tokens_ps[ - group] = 
out_token_len - - # Otherwise update corresponding buffer - else: - self.circular_buffer[curr_idx].ts = curr_ts - - if uid in self.circular_buffer[curr_idx].uid_to_reqs_ps: - self.circular_buffer[curr_idx].uid_to_reqs_ps[uid] += 1 - else: - self.circular_buffer[curr_idx].uid_to_reqs_ps[uid] = 1 - - if uid in self.circular_buffer[curr_idx].uid_to_tokens_ps: - self.circular_buffer[curr_idx].uid_to_tokens_ps[ - uid] += out_token_len - else: - self.circular_buffer[curr_idx].uid_to_tokens_ps[ - uid] = out_token_len - - self.circular_buffer[curr_idx].group_to_reqs_ps[group] += 1 - self.circular_buffer[curr_idx].group_to_tokens_ps[ - group] += out_token_len - - def get_user_usage(self, uid: str, group: str): - """Calculate usage stats of the given user and group.""" - user_req_usage = 0 - user_token_usage = 0 - group_req_usage = 0 - group_token_usage = 0 - - # TODO: use reader lock - with self.lock: - for i in range(self.buffer_count): - if uid in self.circular_buffer[i].uid_to_reqs_ps: - user_req_usage += self.circular_buffer[i].uid_to_reqs_ps[ - uid] - user_token_usage += self.circular_buffer[ - i].uid_to_tokens_ps[uid] - - group_req_usage += self.circular_buffer[i].group_to_reqs_ps[ - group] - group_token_usage += self.circular_buffer[ - i].group_to_tokens_ps[group] - - return (user_req_usage, user_token_usage, group_req_usage, - group_token_usage) From a316aecf2f7adb7254df06e66ad2b099d4834ecd Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 14 Aug 2024 11:28:30 +0800 Subject: [PATCH 14/39] fix Windows compile error (#2303) --- requirements/build.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/build.txt b/requirements/build.txt index b4430ae374..28c4a7abb0 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,2 +1,2 @@ -pybind11 +pybind11<=2.13.1 setuptools From afa54eed61eec11e2340fdc0cf7e5cf9a5d2bffb Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Wed, 14 Aug 2024 16:01:31 +0800 Subject: [PATCH 15/39] fix: follow up #2303 (#2307) * fix: follow up #2303 * upd --- .github/workflows/pypi.yml | 3 ++- .github/workflows/windows-x64-gpu.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 8b610d70b8..bcb992422f 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -65,7 +65,8 @@ jobs: python-version: ${{ matrix.pyver }} - name: Install python packages run: | - pip install pybind11 wheel + pip install -r requirements/build.txt + pip install wheel - name: Setup CUDA Toolkit id: cuda-toolkit shell: pwsh diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml index cf43cc1e28..d3339ac15f 100644 --- a/.github/workflows/windows-x64-gpu.yml +++ b/.github/workflows/windows-x64-gpu.yml @@ -40,7 +40,8 @@ jobs: python-version: '3.8' - name: Install python packages run: | - pip install pybind11 wheel + pip install -r requirements/build.txt + pip install wheel - name: Setup CUDA Toolkit id: cuda-toolkit shell: pwsh From 32dc298fd20be0967bea8af2a7b881a3f6adcd41 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 14 Aug 2024 21:55:44 +0800 Subject: [PATCH 16/39] Support send tool_calls back to internlm2 (#2147) * Support send tool_calls back to internlm2 * update documents * condition --- docs/en/llm/api_server_tools.md | 13 +++++-------- docs/zh_cn/llm/api_server_tools.md | 13 +++++-------- lmdeploy/model.py | 12 ++++++++++-- lmdeploy/serve/async_engine.py | 3 ++- 4 
files changed, 22 insertions(+), 19 deletions(-) diff --git a/docs/en/llm/api_server_tools.md b/docs/en/llm/api_server_tools.md index 0a6b8f7768..56fb1b598a 100644 --- a/docs/en/llm/api_server_tools.md +++ b/docs/en/llm/api_server_tools.md @@ -119,14 +119,11 @@ func1_args = response.choices[0].message.tool_calls[0].function.arguments func1_out = eval(f'{func1_name}(**{func1_args})') print(func1_out) +messages.append(response.choices[0].message) messages.append({ - 'role': 'assistant', - 'content': response.choices[0].message.content -}) -messages.append({ - 'role': 'environment', + 'role': 'tool', 'content': f'3+5={func1_out}', - 'name': 'plugin' + 'tool_call_id': response.choices[0].message.tool_calls[0].id }) response = client.chat.completions.create( model=model_name, @@ -145,9 +142,9 @@ print(func2_out) Using the InternLM2-Chat-7B model to execute the above example, the following results will be printed. ``` -ChatCompletion(id='1', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='1', function=Function(arguments={'a': 3, 'b': 5}, name='add'), type='function')]))], created=1719369986, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=263, total_tokens=288)) +ChatCompletion(id='1', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='0', function=Function(arguments='{"a": 3, "b": 5}', name='add'), type='function')]))], created=1722852901, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=263, total_tokens=288)) 8 -ChatCompletion(id='2', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='2', function=Function(arguments={'a': 8, 'b': 2}, name='mul'), type='function')]))], created=1719369987, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=282, total_tokens=307)) +ChatCompletion(id='2', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='1', function=Function(arguments='{"a": 8, "b": 2}', name='mul'), type='function')]))], created=1722852901, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=293, total_tokens=318)) 16 ``` diff --git a/docs/zh_cn/llm/api_server_tools.md b/docs/zh_cn/llm/api_server_tools.md index 8c44c0e43f..643a39d5d2 100644 --- a/docs/zh_cn/llm/api_server_tools.md +++ b/docs/zh_cn/llm/api_server_tools.md @@ -119,14 +119,11 @@ func1_args = response.choices[0].message.tool_calls[0].function.arguments func1_out = eval(f'{func1_name}(**{func1_args})') print(func1_out) +messages.append(response.choices[0].message) messages.append({ - 'role': 'assistant', - 'content': response.choices[0].message.content -}) -messages.append({ - 'role': 'environment', + 'role': 'tool', 
'content': f'3+5={func1_out}', - 'name': 'plugin' + 'tool_call_id': response.choices[0].message.tool_calls[0].id }) response = client.chat.completions.create( model=model_name, @@ -145,9 +142,9 @@ print(func2_out) 实际使用 InternLM2-Chat-7B 模型执行上述例子,可以得到下面的结果: ``` -ChatCompletion(id='1', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='1', function=Function(arguments={'a': 3, 'b': 5}, name='add'), type='function')]))], created=1719369986, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=263, total_tokens=288)) +ChatCompletion(id='1', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='0', function=Function(arguments='{"a": 3, "b": 5}', name='add'), type='function')]))], created=1722852901, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=263, total_tokens=288)) 8 -ChatCompletion(id='2', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='2', function=Function(arguments={'a': 8, 'b': 2}, name='mul'), type='function')]))], created=1719369987, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=282, total_tokens=307)) +ChatCompletion(id='2', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='1', function=Function(arguments='{"a": 8, "b": 2}', name='mul'), type='function')]))], created=1722852901, model='/nvme/shared_data/InternLM/internlm2-chat-7b', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=25, prompt_tokens=293, total_tokens=318)) 16 ``` diff --git a/lmdeploy/model.py b/lmdeploy/model.py index b619fdc01c..f79d89e06b 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -485,11 +485,13 @@ def messages2prompt(self, box_map = dict(user=self.user, assistant=self.assistant, system=self.system, - environment=self.environment) + environment=self.environment, + tool=self.environment) eox_map = dict(user=self.eoh, assistant=self.eoa + self.separator, system=self.eosys, - environment=self.eoenv) + environment=self.eoenv, + tool=self.eoenv) name_map = dict(plugin=self.plugin, interpreter=self.interpreter) ret = '' if self.meta_instruction is not None and sequence_start: @@ -508,6 +510,12 @@ def messages2prompt(self, for message in messages: role = message['role'] content = message['content'] + if role == 'assistant' and message.get('tool_calls', + None) is not None: + for tool_call in message['tool_calls']: + function = tool_call.get('function', {}) + function['arguments'] = function.pop('parameters', {}) + content += f'<|action_start|><|plugin|>\n{json.dumps(function)}<|action_end|>' if 'name' in message and message['name'] in name_map: begin = box_map[role].strip( ) + f" name={name_map[message['name']]}\n" diff --git a/lmdeploy/serve/async_engine.py 
b/lmdeploy/serve/async_engine.py index f9c7d969dc..47ff4e083e 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -627,7 +627,8 @@ def parse_tool_response(self, text, tools, **kwargs): action = action.split('<|action_end|>'.strip())[0] action = action[action.find('{'):] action = json.loads(action) - name, parameters = action['name'], json.dumps(action['parameters']) + name, parameters = action['name'], json.dumps( + action.get('parameters', action.get('arguments', {}))) elif '') parameters = action[action.find('{'):] From c1923f4c34d23fcca52ae8e49c5db0417327d985 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Thu, 15 Aug 2024 14:18:36 +0800 Subject: [PATCH 17/39] Fix internvl2 template and update docs (#2292) * fix template * add max_dynamic_patch custom setting * fix test * update docs * update docs * remove unnecessary process * update link --- README.md | 2 +- README_zh-CN.md | 2 +- docs/en/multi_modal/internvl.md | 155 +++++++++++++++++++++++- docs/zh_cn/multi_modal/internvl.md | 155 +++++++++++++++++++++++- lmdeploy/model.py | 6 + lmdeploy/serve/vl_async_engine.py | 1 - lmdeploy/vl/model/internvl.py | 6 +- tests/test_lmdeploy/test_model.py | 2 +- tests/test_lmdeploy/test_vl_template.py | 23 ++++ 9 files changed, 342 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 1c7f368ee3..03b3e160bc 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLMs inference - \[2024/07\] 🎉🎉 Support Llama3.1 8B, 70B and its TOOLS CALLING -- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5 +- \[2024/07\] Support [InternVL2](docs/en/multi_modal/internvl.md) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5 - \[2024/06\] PyTorch engine support DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, LlaVA-Next - \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs - \[2024/05\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2 diff --git a/README_zh-CN.md b/README_zh-CN.md index f82e299c37..1bac57543c 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeploy现已集成至 [modelscope/swift](https://github.com/modelscope/swift),成为 VLMs 推理的默认加速引擎 - \[2024/07\] 🎉🎉 支持 Llama3.1 8B 和 70B 模型,以及工具调用功能 -- \[2024/07\] 支持 [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) 全系列模型,[InternLM-XComposer2.5](docs/zh_cn/multi_modal/xcomposer2d5.md) 模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/llm/api_server_tools.md) +- \[2024/07\] 支持 [InternVL2](docs/zh_cn/multi_modal/internvl.md) 全系列模型,[InternLM-XComposer2.5](docs/zh_cn/multi_modal/xcomposer2d5.md) 模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/llm/api_server_tools.md) - \[2024/06\] PyTorch engine 支持了 DeepSeek-V2 和若干 VLM 模型推理, 比如 CogVLM2,Mini-InternVL,LlaVA-Next - \[2024/05\] 在多 GPU 上部署 VLM 模型时,支持把视觉部分的模型均分到多卡上 - \[2024/05\] 支持InternVL v1.5, LLaVa, InternLMXComposer2 等 VLMs 模型的 4bit 
权重量化和推理 diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index 0b204cb920..a43f265ae0 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -1,3 +1,154 @@ -# InternVL +# InternVL2 -TODO +## Introduction + +InternVL is an open source vision-language base model that expands the Vision Transformer (ViT) to 600 million parameters and aligns with the Large Language Model (LLM). It is the largest open-source vision/vision-language foundation model (14B) to date, achieving 32 state-of-the-art performance on a wide range of tasks such as visual perception, cross-modal retrieval, multimodal dialogue, etc. LMDeploy supports InternVL series of models. The following uses InternVL2-8B as an example to demonstrate its usage. + +## Quick Start + +### Installation + +Please install LMDeploy by following the [installation guide](../installation.md), and install other packages that InternVL2 needs + +```shell +pip install timm +``` + +### Offline inference pipeline + +The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe((f'describe this image', image)) +print(response) +``` + +## More examples + +
+ + multi-image multi-round conversation, combined images + + +```python +from lmdeploy import pipeline, GenerationConfig +from lmdeploy.vl import load_image +from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl.constants import IMAGE_TOKEN + +pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\nDescribe the two images in detail.'), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg')), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
+ +
+ + multi-image multi-round conversation, separate images + + +```python +from lmdeploy import pipeline, GenerationConfig +from lmdeploy.vl import load_image +from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl.constants import IMAGE_TOKEN + +pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text=f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\nDescribe the two images in detail.'), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg')), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
+ +
+ + video multi-round conversation + + +```python +import numpy as np +from lmdeploy import pipeline, GenerationConfig +from decord import VideoReader, cpu +from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy.vl.utils import encode_image_base64 +from PIL import Image +pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') + + +def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): + if bound: + start, end = bound[0], bound[1] + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / num_segments + frame_indices = np.array([ + int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) + for idx in range(num_segments) + ]) + return frame_indices + + +def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + fps = float(vr.get_avg_fps()) + pixel_values_list, num_patches_list = [], [] + frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments) + imgs = [] + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB') + imgs.append(img) + return imgs + + +video_path = 'red-panda.mp4' +imgs = load_video(video_path, num_segments=8, max_num=1) + +question = '' +for i in range(len(imgs)): + question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n' + +question += 'What is the red panda doing?' + +content = [{'type': 'text', 'text': question}] +for img in imgs: + content.append({'type': 'image_url', 'image_url': {'max_dynamic_patch': 1, 'url': f'data:image/jpeg;base64,{encode_image_base64(img)}'}}) + +messages = [dict(role='user', content=content)] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='Describe this video in detail. Don\'t repeat.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md index 0b204cb920..01798d052e 100644 --- a/docs/zh_cn/multi_modal/internvl.md +++ b/docs/zh_cn/multi_modal/internvl.md @@ -1,3 +1,154 @@ -# InternVL +# InternVL2 -TODO +## 简介 + +InternVL是一个开源的视觉语言基础模型,它将Vision Transformer(ViT)扩展至6亿参数,并与大型语言模型(LLM)对齐。作为目前最大的开源视觉/视觉语言基础模型(14亿参数),InternVL在视觉感知、跨模态检索、多模态对话等多个任务上实现了32项最先进的性能。LMDeploy 支持了 InternVL 系列模型的推理。下面以 InternVL2-8B 为例,展示其使用方法。 + +## 快速开始 + +### 安装 + +请参考[安装文档](../installation.md)安装 LMDeploy,并安装上游模型库 InternVL2 所需的依赖。 + +```shell +pip install timm +``` + +### 离线推理 pipeline + +以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe((f'describe this image', image)) +print(response) +``` + +## 更多使用例子 + +
+ + 多图多轮对话,拼接图像 + + +```python +from lmdeploy import pipeline, GenerationConfig +from lmdeploy.vl import load_image +from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl.constants import IMAGE_TOKEN + +pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text=f'{IMAGE_TOKEN}{IMAGE_TOKEN}\nDescribe the two images in detail.'), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg')), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
+ +
+ + 多图多轮对话,独立图像 + + +```python +from lmdeploy import pipeline, GenerationConfig +from lmdeploy.vl import load_image +from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl.constants import IMAGE_TOKEN + +pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text=f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\nDescribe the two images in detail.'), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg')), + dict(type='image_url', image_url=dict(max_dynamic_patch=12, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
+ +
+ + 视频多轮对话 + + +```python +import numpy as np +from lmdeploy import pipeline, GenerationConfig +from decord import VideoReader, cpu +from lmdeploy.vl.constants import IMAGE_TOKEN +from lmdeploy.vl.utils import encode_image_base64 +from PIL import Image +pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') + + +def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): + if bound: + start, end = bound[0], bound[1] + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / num_segments + frame_indices = np.array([ + int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) + for idx in range(num_segments) + ]) + return frame_indices + + +def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + fps = float(vr.get_avg_fps()) + pixel_values_list, num_patches_list = [], [] + frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments) + imgs = [] + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB') + imgs.append(img) + return imgs + + +video_path = 'red-panda.mp4' +imgs = load_video(video_path, num_segments=8, max_num=1) + +question = '' +for i in range(len(imgs)): + question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n' + +question += 'What is the red panda doing?' + +content = [{'type': 'text', 'text': question}] +for img in imgs: + content.append({'type': 'image_url', 'image_url': {'max_dynamic_patch': 1, 'url': f'data:image/jpeg;base64,{encode_image_base64(img)}'}}) + +messages = [dict(role='user', content=content)] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='Describe this video in detail. Don\'t repeat.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +``` + +
diff --git a/lmdeploy/model.py b/lmdeploy/model.py index f79d89e06b..9083463de1 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -558,9 +558,15 @@ class InternVL2InternLM2(InternLM2Chat7B): def __init__( self, meta_instruction='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。', + eosys='<|im_end|>', + eoh='<|im_end|>', + separator='', stop_words=['<|im_start|>', '<|im_end|>'], **kwargs): super().__init__(meta_instruction=meta_instruction, + eosys=eosys, + separator=separator, + eoh=eoh, stop_words=stop_words, **kwargs) diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index 09c05780fb..f8e707e5c6 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -124,7 +124,6 @@ def __call__(self, prompts: Union[VLPromptType, List[Dict], List[VLPromptType], List[List[Dict]]], **kwargs): """Inference a batch of prompts.""" - prompts = self._convert_prompts(prompts) return super().__call__(prompts, **kwargs) def chat(self, prompts: VLPromptType, **kwargs): diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index 9f8bffe3e6..d85fe30939 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -141,8 +141,10 @@ def _preprocess_v1_5(self, images: List[Image], params: List[Dict] = None): outputs = [] for image, param in zip(images, params): - res_key = param.get('detail', 'default') - max_num = image_res.get(res_key, self.config.max_dynamic_patch) + max_num = param.get('max_dynamic_patch') + if max_num is None or not isinstance(max_num, int): + res_key = param.get('detail', 'default') + max_num = image_res.get(res_key, self.config.max_dynamic_patch) out = dynamic_preprocess( image, min_num=self.config.min_dynamic_patch, diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py index f779387852..4aa0ca43fe 100644 --- a/tests/test_lmdeploy/test_model.py +++ b/tests/test_lmdeploy/test_model.py @@ -469,7 +469,7 @@ def test_internvl2(): }] expected = '<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的'\ '书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。'\ - '<|im_end|>\n<|im_start|>user\nwho are you<|im_end|>\n<|im_start|>'\ + '<|im_end|><|im_start|>user\nwho are you<|im_end|><|im_start|>'\ 'assistant\nI am an AI' res = model.messages2prompt(messages) assert res == expected diff --git a/tests/test_lmdeploy/test_vl_template.py b/tests/test_lmdeploy/test_vl_template.py index d99f9b8209..cf8abf9e44 100644 --- a/tests/test_lmdeploy/test_vl_template.py +++ b/tests/test_lmdeploy/test_vl_template.py @@ -46,6 +46,29 @@ def test_messages2prompt(): assert prompt == expected +def test_internvl2_conv(): + # https://huggingface.co/OpenGVLab/InternVL2-8B/blob/3bfd3664dea4f3da628785f5125d30f889701253/conversation.py + from transformers.dynamic_module_utils import get_class_from_dynamic_module + get_conv_template = get_class_from_dynamic_module( + 'conversation.get_conv_template', 'OpenGVLab/InternVL2-8B') + template = get_conv_template('internlm2-chat') + question1 = 'question1' + template.append_message(template.roles[0], question1) + template.append_message(template.roles[1], None) + model = MODELS.get('internvl2-internlm2')() + messages = [dict(role='user', content=question1)] + assert template.get_prompt() == model.messages2prompt(messages) + + answer1 = 'answer1' + template.messages[-1][1] = answer1 + question2 = 'question2' + template.append_message(template.roles[0], question2) + template.append_message(template.roles[1], None) + messages.append(dict(role='assistant', 
content=answer1)) + messages.append(dict(role='user', content=question2)) + assert template.get_prompt() == model.messages2prompt(messages) + + def test_llava_conv_chatml_direct(): model = MODELS.get('llava-chatml')() templtae = VLChatTemplateWrapper(model) From 3b0fd0de034bbc5be3f8425ca469c3deaa571b0b Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:14:22 +0800 Subject: [PATCH 18/39] Add stream options to control usage (#2313) --- lmdeploy/serve/openai/api_server.py | 29 ++++++++++++++++++++--------- lmdeploy/serve/openai/protocol.py | 10 ++++++++++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 301ff76763..1b64f877cc 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -347,11 +347,11 @@ async def chat_completions_v1(request: ChatCompletionRequest, adapter_name=adapter_name, ) - def create_stream_response_json( - index: int, - text: str, - finish_reason: Optional[str] = None, - logprobs: Optional[LogProbs] = None) -> str: + def create_stream_response_json(index: int, + text: str, + finish_reason: Optional[str] = None, + logprobs: Optional[LogProbs] = None, + usage: Optional[UsageInfo] = None) -> str: choice_data = ChatCompletionResponseStreamChoice( index=index, delta=DeltaMessage(role='assistant', content=text), @@ -362,6 +362,7 @@ def create_stream_response_json( created=created_time, model=model_name, choices=[choice_data], + usage=usage, ) response_json = response.model_dump_json() @@ -369,17 +370,27 @@ def create_stream_response_json( async def completion_stream_generator() -> AsyncGenerator[str, None]: async for res in result_generator: - logprobs = None + logprobs, usage = None, None if gen_logprobs and res.logprobs: logprobs = _create_chat_completion_logprobs( VariableInterface.async_engine.tokenizer, res.token_ids, res.logprobs) - + if request.stream_options and request.stream_options.include_usage: + total_tokens = sum([ + res.history_token_len, res.input_token_len, + res.generate_token_len + ]) + usage = UsageInfo( + prompt_tokens=res.input_token_len, + completion_tokens=res.generate_token_len, + total_tokens=total_tokens, + ) response_json = create_stream_response_json( index=0, text=res.response, finish_reason=res.finish_reason, - logprobs=logprobs) + logprobs=logprobs, + usage=usage) yield f'data: {response_json}\n\n' yield 'data: [DONE]\n\n' @@ -583,7 +594,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: res.token_ids, res.logprobs, gen_config.skip_special_tokens, offset, all_token_ids, state) - if res.finish_reason is not None: + if request.stream_options and request.stream_options.include_usage: # noqa E501 final_res = res total_tokens = sum([ final_res.history_token_len, final_res.input_token_len, diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 59a2a3968a..b1f83139e7 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -80,6 +80,11 @@ class ToolChoice(BaseModel): examples=['function']) +class StreamOptions(BaseModel): + """The stream options.""" + include_usage: Optional[bool] = False + + class ChatCompletionRequest(BaseModel): """Chat completion request.""" model: str @@ -96,6 +101,8 @@ class ChatCompletionRequest(BaseModel): stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None]) # noqa # yapf: enable stream: Optional[bool] = False + 
stream_options: Optional[StreamOptions] = Field(default=None, + examples=[None]) presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None @@ -205,6 +212,8 @@ class CompletionRequest(BaseModel): stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None]) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = Field(default=None, + examples=[None]) top_p: Optional[float] = 1.0 logprobs: Optional[int] = None echo: Optional[bool] = False @@ -252,6 +261,7 @@ class CompletionStreamResponse(BaseModel): created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[CompletionResponseStreamChoice] + usage: Optional[UsageInfo] = None class EmbeddingsRequest(BaseModel): From 26c00ab3c43d0dffc3d3626e9fa7a03147197b61 Mon Sep 17 00:00:00 2001 From: john tong <40350896+ColorfulDick@users.noreply.github.com> Date: Mon, 19 Aug 2024 11:05:20 +0800 Subject: [PATCH 19/39] fix the issue missing dependencies in the Dockerfile and pip (#2240) * fix the issue missing dependencies in the Dockerfile and pip * reset dependencies * reset compose images tag * add InternVL_Dockerfile * nvidia/cuda image should be as low as possible,now as of nccl2.22.3,the minimum supported CUDA version if 12.2; if nvidia device version on host machine higher than images,torch in images will not work * remove a line * fix the apt error in InternVL_Dockerfile * apt add -y * change InternVL_Dockerfile base image tag * roll back .\requirements\test.txt to 7c4e75b53a8c * add rust build tools in Dockerfile to fix the bug when gradio>4.40.0,which depend on orjson should build with cargo * run pre-commit to fix;and add internvl docs * move rust install command to where installing sys packages * fix internVL docs layout * remove tritonclient[grpc] from serve.txt * fix docs about InternVL * fix en docs about internVL markdown format * fix en docs about internVL markdown format * update * update docs of internVL by H.Lyu * fix supported inference engine info for InternVL * remove nccl installation since it is already in the docker image * remove nccl install and chage base image tag * change base image to 12.4.1 --------- Co-authored-by: lvhan028 --- docker/Dockerfile | 6 +- docker/InternVL_Dockerfile | 13 ++++ docs/en/multi_modal/internvl.md | 115 ++++++++++++++++++++++++---- docs/zh_cn/multi_modal/internvl.md | 118 +++++++++++++++++++++++++---- requirements/runtime.txt | 1 + 5 files changed, 221 insertions(+), 32 deletions(-) create mode 100644 docker/InternVL_Dockerfile diff --git a/docker/Dockerfile b/docker/Dockerfile index 45f30d59a0..677597f85c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -13,7 +13,8 @@ ARG PYTHON_VERSION=3.10 ARG TORCH_VERSION=2.3.0 ARG TORCHVISION_VERSION=0.18.0 -RUN rm /etc/apt/sources.list.d/cuda*.list && apt-get update -y && apt-get install -y software-properties-common wget vim &&\ +RUN apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\ + curl https://sh.rustup.rs -sSf | sh -s -- -y &&\ add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \ ninja-build rapidjson-dev libgoogle-glog-dev gdb python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3 @@ -46,7 +47,8 @@ RUN cd /opt/lmdeploy &&\ ninja -j$(nproc) && ninja install &&\ cd .. &&\ python3 -m pip install -e . 
&&\ - rm -rf build + rm -rf build &&\ + rm -rf ~/.cache/* ENV LD_LIBRARY_PATH=/opt/lmdeploy/install/lib:$LD_LIBRARY_PATH ENV PATH=/opt/lmdeploy/install/bin:$PATH diff --git a/docker/InternVL_Dockerfile b/docker/InternVL_Dockerfile new file mode 100644 index 0000000000..2652845d92 --- /dev/null +++ b/docker/InternVL_Dockerfile @@ -0,0 +1,13 @@ +ARG CUDA_VERSION=cu12 + +FROM openmmlab/lmdeploy:latest-cu12 AS cu12 +ENV CUDA_VERSION_SHORT=cu123 + +FROM openmmlab/lmdeploy:latest-cu11 AS cu11 +ENV CUDA_VERSION_SHORT=cu118 + +FROM ${CUDA_VERSION} AS final + +RUN python3 -m pip install timm + +RUN python3 -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+${CUDA_VERSION_SHORT}torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index a43f265ae0..8f0f81387d 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -1,22 +1,43 @@ -# InternVL2 +# InternVL -## Introduction +LMDeploy supports the following InternVL series of models, which are detailed in the table below: -InternVL is an open source vision-language base model that expands the Vision Transformer (ViT) to 600 million parameters and aligns with the Large Language Model (LLM). It is the largest open-source vision/vision-language foundation model (14B) to date, achieving 32 state-of-the-art performance on a wide range of tasks such as visual perception, cross-modal retrieval, multimodal dialogue, etc. LMDeploy supports InternVL series of models. The following uses InternVL2-8B as an example to demonstrate its usage. +| Model | Size | Supported Inference Engine | +| :---------: | :--------: | :------------------------: | +| InternVL | 13B-19B | TurboMind | +| InternVL1.5 | 2B-26B | TurboMind, PyTorch | +| InternVL2 | 1B, 4B | PyTorch | +| InternVL2 | 2B, 8B-76B | TurboMind, PyTorch | -## Quick Start +The next chapter demonstrates how to deploy an InternVL model using LMDeploy, with [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B) as an example. -### Installation +## Installation Please install LMDeploy by following the [installation guide](../installation.md), and install other packages that InternVL2 needs ```shell pip install timm +# It is recommended to find the whl package that matches the environment from the releases on https://github.com/Dao-AILab/flash-attention. +pip install flash-attn ``` -### Offline inference pipeline +Or, you can build a docker image to set up the inference environment. If the CUDA version on your host machine is `>=12.4`, you can run: -The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) +``` +docker build --build-arg CUDA_VERSION=cu12 -t openmmlab/lmdeploy:internvl . -f ./docker/InternVL_Dockerfile +``` + +Otherwise, you can go with: + +```shell +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +docker build --build-arg CUDA_VERSION=cu11 -t openmmlab/lmdeploy:internvl . -f ./docker/InternVL_Dockerfile +``` + +## Offline inference + +The following sample code shows the basic usage of VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline @@ -29,7 +50,7 @@ response = pipe((f'describe this image', image)) print(response) ``` -## More examples +More examples are listed below:
@@ -38,8 +59,6 @@ print(response) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl import load_image -from lmdeploy.vl.utils import encode_image_base64 from lmdeploy.vl.constants import IMAGE_TOKEN pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') @@ -66,8 +85,6 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1)) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl import load_image -from lmdeploy.vl.utils import encode_image_base64 from lmdeploy.vl.constants import IMAGE_TOKEN pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') @@ -117,7 +134,7 @@ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): return frame_indices -def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32): +def load_video(video_path, bound=None, num_segments=32): vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) max_frame = len(vr) - 1 fps = float(vr.get_avg_fps()) @@ -131,7 +148,7 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3 video_path = 'red-panda.mp4' -imgs = load_video(video_path, num_segments=8, max_num=1) +imgs = load_video(video_path, num_segments=8) question = '' for i in range(len(imgs)): @@ -152,3 +169,73 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1)) ```
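Note that sampling many frames enlarges the prompt considerably, and the accumulated image tokens may exceed the default context window. The sketch below shows one way to enlarge it through the engine config; `session_len=16384` is an assumed value, tune it to your memory budget:

```python
from lmdeploy import pipeline, TurbomindEngineConfig

# 16k context is an assumed value for illustration; adjust it to your GPU memory
pipe = pipeline('OpenGVLab/InternVL2-8B',
                backend_config=TurbomindEngineConfig(session_len=16384))
```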
+ +## Online serving + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server OpenGVLab/InternVL2-8B +``` + +You can also start the service using the aforementioned built docker image: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:internvl \ + lmdeploy serve api_server OpenGVLab/InternVL2-8B +``` + +The docker compose is another option. Create a `docker-compose.yml` configuration file in the root directory of the lmdeploy project as follows: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:internvl + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server OpenGVLab/InternVL2-8B + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +Then, you can execute the startup command as below: + +```shell +docker-compose up -d +``` + +If you find the following logs after running `docker logs -f lmdeploy`, it means the service launches successfully. + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. + +More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md index 01798d052e..c51870d6b2 100644 --- a/docs/zh_cn/multi_modal/internvl.md +++ b/docs/zh_cn/multi_modal/internvl.md @@ -1,22 +1,43 @@ -# InternVL2 +# InternVL -## 简介 +LMDeploy 支持 InternVL 系列模型,具体如下: -InternVL是一个开源的视觉语言基础模型,它将Vision Transformer(ViT)扩展至6亿参数,并与大型语言模型(LLM)对齐。作为目前最大的开源视觉/视觉语言基础模型(14亿参数),InternVL在视觉感知、跨模态检索、多模态对话等多个任务上实现了32项最先进的性能。LMDeploy 支持了 InternVL 系列模型的推理。下面以 InternVL2-8B 为例,展示其使用方法。 +| Model | Size | Supported Inference Engine | +| :---------: | :--------: | :------------------------: | +| InternVL | 13B-19B | TurboMind | +| InternVL1.5 | 2B-26B | TurboMind, PyTorch | +| InternVL2 | 1B, 4B | PyTorch | +| InternVL2 | 2B, 8B-76B | TurboMind, PyTorch | -## 快速开始 +本文将以[InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)为例,演示使用 LMDeploy 部署 InternVL 系列模型的方法 -### 安装 +## 安装 -请参考[安装文档](../installation.md)安装 LMDeploy,并安装上游模型库 InternVL2 所需的依赖。 +请参考[安装文档](../installation.md)安装 LMDeploy,并安装上游 InternVL 模型库需的依赖。 ```shell pip install timm +# 建议从https://github.com/Dao-AILab/flash-attention/releases寻找和环境匹配的whl包 +pip install flash-attn ``` -### 离线推理 pipeline +或者,你可以为 InternVL 的推理构建 docker image。如果,宿主机器上的 CUDA 版本 `>=12.4`,你可以执行如下命令构建镜像: -以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) +``` +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +docker build --build-arg CUDA_VERSION=cu12 -t openmmlab/lmdeploy:internvl . 
-f ./docker/InternVL_Dockerfile +``` + +否则的话,可以基于 LMDeploy cu11 的镜像来构建: + +```shell +docker build --build-arg CUDA_VERSION=cu11 -t openmmlab/lmdeploy:internvl . -f ./docker/InternVL_Dockerfile +``` + +## 离线推理 + +以下是使用 pipeline 进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline @@ -29,7 +50,7 @@ response = pipe((f'describe this image', image)) print(response) ``` -## 更多使用例子 +更多例子如下:
@@ -38,8 +59,6 @@ print(response) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl import load_image -from lmdeploy.vl.utils import encode_image_base64 from lmdeploy.vl.constants import IMAGE_TOKEN pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') @@ -66,8 +85,6 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1)) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl import load_image -from lmdeploy.vl.utils import encode_image_base64 from lmdeploy.vl.constants import IMAGE_TOKEN pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') @@ -117,11 +134,10 @@ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): return frame_indices -def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32): +def load_video(video_path, bound=None, num_segments=32): vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) max_frame = len(vr) - 1 fps = float(vr.get_avg_fps()) - pixel_values_list, num_patches_list = [], [] frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments) imgs = [] for frame_index in frame_indices: @@ -131,7 +147,7 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3 video_path = 'red-panda.mp4' -imgs = load_video(video_path, num_segments=8, max_num=1) +imgs = load_video(video_path, num_segments=8) question = '' for i in range(len(imgs)): @@ -152,3 +168,73 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1)) ```
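补充一点:对于参数量更大的 InternVL2 模型(例如 26B 及以上),单卡显存可能不足,可以通过 backend_config 开启张量并行。下面仅是一个示意,其中 tp=2 为假设值,请按实际 GPU 数量调整:

```python
from lmdeploy import pipeline, TurbomindEngineConfig

# tp=2 仅为示意,请按实际 GPU 数量调整
pipe = pipeline('OpenGVLab/InternVL2-26B',
                backend_config=TurbomindEngineConfig(tp=2))
```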
+ +## 在线服务 + +你可以通过 `lmdeploy serve api_server` CLI 工具启动服务: + +```shell +lmdeploy serve api_server OpenGVLab/InternVL2-8B +``` + +也可以基于前文构建的 docker image 启动服务: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:internvl \ + lmdeploy serve api_server OpenGVLab/InternVL2-8B +``` + +Docker compose 的方式也是一种选择。在 LMDeploy 代码库的根目录下创建`docker-compose.yml`文件,内容参考如下: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:internvl + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server OpenGVLab/InternVL2-8B + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +然后,你就可以执行命令启动服务了: + +```shell +docker-compose up -d +``` + +通过`docker logs -f lmdeploy`可以查看启动的日志信息,如果发现类似下方的日志信息,就表明服务启动成功了。 + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +有关 `lmdeploy serve api_server` 的详细参数可以通过`lmdeploy serve api_server -h`查阅。 + +关于 `api_server` 更多的介绍,以及访问 `api_server` 的方法,请阅读[此处](api_server_vl.md) diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 9564e7e89d..c6a1e74444 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -4,6 +4,7 @@ fastapi fire mmengine-lite numpy<2.0.0 +openai peft<=0.11.1 pillow protobuf From 219725078f8f83ab0506824fd0cce3a08e7f6b69 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Mon, 19 Aug 2024 11:57:22 +0800 Subject: [PATCH 20/39] add device type for pytorch engine in cli(#2321) --- lmdeploy/cli/cli.py | 5 +++-- lmdeploy/cli/serve.py | 6 ++++-- lmdeploy/cli/utils.py | 9 +++++++++ lmdeploy/turbomind/chat.py | 1 - 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index 4a98d830d7..f33c276d5d 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -111,7 +111,7 @@ def add_parser_chat(): # pytorch engine args pt_group = parser.add_argument_group('PyTorch engine arguments') ArgumentHelper.adapters(pt_group) - + ArgumentHelper.device_type(pt_group) # common engine args tp_act = ArgumentHelper.tp(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) @@ -250,7 +250,7 @@ def chat(args): cache_max_entry_count=args.cache_max_entry_count, adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, - ) + device_type=args.device_type) run_chat(args.model_path, engine_config, chat_template_config=chat_template_config) @@ -259,6 +259,7 @@ def chat(args): kwargs = convert_args(args) kwargs.pop('chat_template') kwargs.pop('backend') + kwargs.pop('device_type') kwargs['chat_template_config'] = chat_template_config run_chat(**kwargs) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 85a1a13dad..d615b815b9 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -58,6 +58,7 @@ def add_parser_gradio(): # common engine args tp_act = ArgumentHelper.tp(pt_group) + 
ArgumentHelper.device_type(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) max_batch_size_act = ArgumentHelper.max_batch_size(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) @@ -143,6 +144,7 @@ def add_parser_api_server(): pt_group = parser.add_argument_group('PyTorch engine arguments') ArgumentHelper.adapters(pt_group) + ArgumentHelper.device_type(pt_group) # common engine args tp_act = ArgumentHelper.tp(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) @@ -209,7 +211,7 @@ def gradio(args): block_size=args.cache_block_seq_len, session_len=args.session_len, enable_prefix_caching=args.enable_prefix_caching, - ) + device_type=args.device_type) else: backend_config = TurbomindEngineConfig( tp=args.tp, @@ -252,7 +254,7 @@ def api_server(args): session_len=args.session_len, adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, - ) + device_type=args.device_type) else: from lmdeploy.messages import TurbomindEngineConfig backend_config = TurbomindEngineConfig( diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 468f10115e..00eb7b8cf7 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -452,3 +452,12 @@ def vision_max_batch_size(parser): type=int, default=1, help='the vision model batch size') + + @staticmethod + def device_type(parser, default: str = 'cuda'): + return parser.add_argument( + '--device-type', + type=str, + default=default, + choices=['cuda', 'ascend'], + help='The inference device type for pytorch engine.') diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index 2cb6b2c019..ba488b77a4 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -137,7 +137,6 @@ def main(model_path: str, if model.capability == 'chat': sequence_start = (nth_round == 1) sequence_end = False - step = step else: sequence_start = True sequence_end = True From b28a1d048491b9ffd6d1bff48a424d40622ae147 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Mon, 19 Aug 2024 14:15:36 +0800 Subject: [PATCH 21/39] New GEMM kernels for weight-only quantization (#2090) * preprocess for kv-int8 * working kv-int8 * minor * working kv-int4 * optimize kv-int4 * optimize kv-int4 * optimized SIMT f16/u8/u4 decoding * fix tc decoding * int8 tc decoding * int4 tc decoding * minor * optimize * optimize tc kv-int4/int8 * fix `sm_75`/`sm_70` * simplify * bf16+kv4/8 * support more mma instruction * refactor * dispatching * integration * remove offline kv params * fix msvc build * fix msvc build * fix lint * fix lint * fix cmake * fix lint * fix lint * minor * refactor * gemm baseline * optimize * minor * tb swizzle * minor * tune * minor * wip * minor * fp16 transcription * optimize * tune * adjust layout * optimize * tune * refactor * refactor * f16xs4/8 gemm * refactor * dequant * fix Q * fix Q * end-to-end test * optimize Q * pack Q * tune * split-k * sliced-k * fix Q * add `transpose_m8n8_b32` * tune gemm * predicate support * tune * dispatch * dispatch v2 * automatic tuning * nvbench * better API * GPU metrics * update cost model * add simt impl * add 16816 * add 884 * refactor * smem copy * minor * NT & NN * transformation * refactor * refactor * add UV * refactor testbed * working fp16 packing * update * use `(m, k)` & `(n, k)` * simplify * dispatch for conversion * refactor * refactor * refactor * simplify * refactor quantization * quantization * fix `pack_cnt_m/k` * `uint8_t` * `uint4_t` * symmetry * refactor * large pack * fix `SmemCopy` for packed inputs * tune * SIMT * SIMT packing * 
SIMT int8 * SIMT int4 * fix group size * mma.m8n8k4 * clean-up * refactor epilogue * fix smem layout for C * tune epilogue * TN * optimize * fix `_src_step_k` * use raked partition * fix `Tiled_MMA_v2` & optimize smem copy * working w4a16 * add missing * fuse up and gate * fused silu * `sm75` and `sm70` * cache policy * remove unused * col major output * fix tiling of C * wip * wip * wip * fix iterator * update * update kernel signature * fix packing * update * refactor * update * update * update * alpha beta * set beta * fix & clean-up * check max splits & add qwen * add tp * refactor `LlamaLinear` * share linear layer * tuning interface * update * skip nvbench for MSVC * define `uint` when needed * fix * fix * fix * update * disable large kernels * fix * refactor model conversion * fix lint * simplify target model * refactor model import * minor * pad `inter_size` for tp * refactor * skip `sm_80` and `sm_90` on MSVC * fix msvc build * fix msvc build * fix msvc build * fix msvc build * fix msvc build * fix cu12 / sm90 build * fix * fix ut * fix missing include * support GPTQ models * fix ut * parse tuning args * minor * minor * add missing header * add missing headers * fix converter * fix internvl reader initializer * fix * tuning * remove unused * tuning * minor * fix lint * fix lint * fix lint * minor * fix lint * fix baichuan2-7b, deepseek-vl and xcomposer2d5-4bit * tune sm_70 * optimize sm70 & fix converter * optimize v100 * fix lint * RTX 4090 * fix lint * refactor & batch_dim support * A100 * `TuningParams` * lint * lint * minor * switch to m-major MMA for sm70 * recognize GPTQ models * RTX 2080 & GTX 1660 * fix missing return * fix cu12 build for sm90 * fix ptr of operand C * disable cache eviction policy on sm_90 * fix lint * add refs * fix lint * lint --- .github/workflows/lint.yml | 2 +- .pre-commit-config.yaml | 2 +- CMakeLists.txt | 4 +- lmdeploy/cli/utils.py | 2 +- lmdeploy/turbomind/deploy/converter.py | 41 +- lmdeploy/turbomind/deploy/exporter.py | 211 ++++++ lmdeploy/turbomind/deploy/policy.py | 55 ++ .../turbomind/deploy/source_model/__init__.py | 9 +- .../turbomind/deploy/source_model/baichuan.py | 47 +- .../deploy/source_model/baichuan_awq.py | 89 --- .../turbomind/deploy/source_model/base.py | 9 + .../deploy/source_model/deepseek_vl.py | 87 +-- .../deploy/source_model/deepseek_vl_awq.py | 53 -- .../turbomind/deploy/source_model/glm4.py | 44 +- .../turbomind/deploy/source_model/glm4_awq.py | 93 --- .../deploy/source_model/internlm2.py | 119 +--- .../turbomind/deploy/source_model/internvl.py | 76 +- .../turbomind/deploy/source_model/llama.py | 34 +- .../deploy/source_model/llama_awq.py | 69 -- .../turbomind/deploy/source_model/minicpmv.py | 5 +- .../deploy/source_model/minicpmv_awq.py | 20 - .../turbomind/deploy/source_model/qwen.py | 84 +-- .../turbomind/deploy/source_model/qwen_awq.py | 99 --- .../deploy/source_model/xcomposer2.py | 62 +- .../deploy/source_model/xcomposer2_awq.py | 79 --- .../turbomind/deploy/target_model/__init__.py | 3 - .../turbomind/deploy/target_model/base.py | 50 +- lmdeploy/turbomind/deploy/target_model/fp.py | 70 +- .../turbomind/deploy/target_model/plora.py | 126 ---- .../turbomind/deploy/target_model/plora_w4.py | 146 ---- lmdeploy/turbomind/deploy/target_model/w4.py | 166 ----- lmdeploy/turbomind/turbomind.py | 23 +- src/turbomind/kernels/CMakeLists.txt | 3 +- src/turbomind/kernels/activation_kernels.cu | 60 ++ src/turbomind/kernels/activation_kernels.h | 4 + .../kernels/attention/attention_universal.h | 11 +- 
src/turbomind/kernels/attention/block.h | 14 +- .../kernels/attention/decoding_template.h | 2 +- src/turbomind/kernels/attention/impl_16816.h | 13 +- src/turbomind/kernels/attention/impl_1688.h | 11 +- src/turbomind/kernels/attention/impl_81616.h | 11 +- src/turbomind/kernels/attention/impl_884.h | 53 +- src/turbomind/kernels/attention/impl_m16n8.h | 2 +- src/turbomind/kernels/attention/impl_simt.h | 10 +- src/turbomind/kernels/attention/iterator.h | 26 +- .../kernels/attention/iterator_sm70.h | 2 +- .../kernels/attention/iterator_sm80.h | 2 +- .../kernels/attention/kv_cache_utils_v2.cu | 12 +- .../kernels/attention/kv_cache_utils_v2.h | 2 +- .../kernels/attention/mainloop_sm80.h | 2 +- .../kernels/attention/quantization.h | 99 ++- src/turbomind/kernels/attention/reduce.h | 5 + .../kernels/attention/reduce_kernel.h | 6 +- src/turbomind/kernels/attention/reference.cu | 3 +- .../kernels/attention/rotary_embedding.h | 188 +++++ src/turbomind/kernels/attention/test_quant.cu | 3 +- src/turbomind/kernels/core/array.h | 143 ++++ .../kernels/{attention => core}/array_ops.h | 330 +++------ src/turbomind/kernels/core/common.h | 60 ++ .../kernels/{attention => core}/data_type.h | 67 +- .../smem_layout.h => core/layout.h} | 34 +- src/turbomind/kernels/core/math.h | 37 + src/turbomind/kernels/core/meta.h | 52 ++ src/turbomind/kernels/core/mma.h | 211 ++++++ src/turbomind/kernels/core/pipe_iter.h | 25 + src/turbomind/kernels/core/smem.h | 106 +++ src/turbomind/kernels/core/sub_byte_ptr.h | 51 ++ src/turbomind/kernels/core/sync.h | 53 ++ .../kernels/{attention => core}/thread_map.h | 2 +- src/turbomind/kernels/gemm/CMakeLists.txt | 64 ++ src/turbomind/kernels/gemm/arch.h | 49 ++ src/turbomind/kernels/gemm/arch/config_simt.h | 92 +++ .../kernels/gemm/arch/config_sm70_s884.h | 83 +++ .../kernels/gemm/arch/config_sm75_s16816.h | 83 +++ .../kernels/gemm/arch/config_sm80_s16816.h | 91 +++ src/turbomind/kernels/gemm/arch/mma_simt.h | 71 ++ src/turbomind/kernels/gemm/arch/mma_sm70.h | 76 ++ src/turbomind/kernels/gemm/arch/mma_sm80.h | 74 ++ .../kernels/gemm/arch/operand_simt.h | 175 +++++ .../kernels/gemm/arch/operand_sm70_s884.h | 169 +++++ .../kernels/gemm/arch/operand_sm80_s16816.h | 249 +++++++ .../kernels/gemm/arch/smem_copy_simt.h | 102 +++ .../kernels/gemm/arch/smem_copy_sm70.h | 113 +++ .../kernels/gemm/arch/smem_copy_sm80.h | 207 ++++++ src/turbomind/kernels/gemm/cast.cu | 196 +++++ src/turbomind/kernels/gemm/cast.h | 39 + src/turbomind/kernels/gemm/convert_v2.cu | 241 +++++++ src/turbomind/kernels/gemm/convert_v2.h | 201 ++++++ src/turbomind/kernels/gemm/cp_async.h | 211 ++++++ src/turbomind/kernels/gemm/cta_map.h | 86 +++ src/turbomind/kernels/gemm/desc.h | 92 +++ src/turbomind/kernels/gemm/dispatch_cache.cu | 414 +++++++++++ src/turbomind/kernels/gemm/dispatch_cache.h | 32 + src/turbomind/kernels/gemm/epilogue.h | 465 ++++++++++++ src/turbomind/kernels/gemm/format.h | 74 ++ src/turbomind/kernels/gemm/gemm.cu | 402 +++++++++++ src/turbomind/kernels/gemm/gemm.h | 55 ++ src/turbomind/kernels/gemm/gemm_universal.h | 174 +++++ src/turbomind/kernels/gemm/gpu_metric.cu | 163 +++++ src/turbomind/kernels/gemm/gpu_metric.h | 15 + src/turbomind/kernels/gemm/iterator.h | 61 ++ src/turbomind/kernels/gemm/iterator_sm70.h | 265 +++++++ src/turbomind/kernels/gemm/iterator_sm80.h | 213 ++++++ src/turbomind/kernels/gemm/kernel.cu | 234 ++++++ src/turbomind/kernels/gemm/kernel.h | 118 ++++ .../kernel/f16_u4g128_f16_tnt_sm70_s884.cu | 62 ++ .../kernel/f16_u4g128_f16_tnt_sm75_s16816.cu | 44 ++ 
.../kernel/f16_u4g128_f16_tnt_sm75_simt.cu | 45 ++ .../kernel/f16_u4g128_f16_tnt_sm80_s16816.cu | 97 +++ .../kernel/f16_u4g128_f16_tnt_sm90_s16816.cu | 68 ++ .../kernel/u4g128_f16_f16_nnn_sm80_s16816.cu | 118 ++++ src/turbomind/kernels/gemm/kernel_impl.h | 320 +++++++++ src/turbomind/kernels/gemm/mainloop_sm70.h | 354 ++++++++++ src/turbomind/kernels/gemm/mainloop_sm80_v2.h | 390 ++++++++++ src/turbomind/kernels/gemm/operand.h | 66 ++ src/turbomind/kernels/gemm/predicate.h | 55 ++ src/turbomind/kernels/gemm/registry.cu | 39 + src/turbomind/kernels/gemm/registry.h | 43 ++ src/turbomind/kernels/gemm/simt.h | 19 + src/turbomind/kernels/gemm/smem_copy.h | 200 ++++++ src/turbomind/kernels/gemm/test/gemm_bench.cu | 89 +++ src/turbomind/kernels/gemm/test/gemm_test.cu | 68 ++ .../kernels/gemm/test/quantization.cu | 27 + .../kernels/gemm/test/quantization.h | 22 + .../kernels/gemm/test/quantization_impl.h | 211 ++++++ src/turbomind/kernels/gemm/test/reference.cu | 109 +++ src/turbomind/kernels/gemm/test/reference.h | 24 + src/turbomind/kernels/gemm/test/test_utils.cu | 200 ++++++ src/turbomind/kernels/gemm/test/test_utils.h | 44 ++ src/turbomind/kernels/gemm/test/testbed.h | 422 +++++++++++ src/turbomind/kernels/gemm/thread_group_map.h | 117 +++ src/turbomind/kernels/gemm/thread_map.h | 246 +++++++ src/turbomind/kernels/gemm/tiled_mma.h | 209 ++++++ src/turbomind/kernels/gemm/transform.h | 120 ++++ .../kernels/gemm/tuner/cache_utils.cu | 28 + .../kernels/gemm/tuner/cache_utils.h | 21 + src/turbomind/kernels/gemm/tuner/measurer.cu | 84 +++ src/turbomind/kernels/gemm/tuner/measurer.h | 41 ++ src/turbomind/kernels/gemm/tuner/params.cc | 104 +++ src/turbomind/kernels/gemm/tuner/params.h | 41 ++ src/turbomind/kernels/gemm/tuner/sampler.cu | 79 +++ src/turbomind/kernels/gemm/tuner/sampler.h | 23 + src/turbomind/kernels/gemm/tuner/stats.h | 46 ++ .../kernels/gemm/tuner/stopping_criterion.cc | 36 + .../kernels/gemm/tuner/stopping_criterion.h | 16 + src/turbomind/kernels/gemm/types.h | 240 +++++++ src/turbomind/kernels/gemm/unpack.cu | 87 +++ src/turbomind/kernels/gemm/utils.h | 128 ++++ .../kernels/gemm_s_f16/CMakeLists.txt | 7 - src/turbomind/kernels/gemm_s_f16/common.h | 668 ------------------ .../kernels/gemm_s_f16/cta_iterator.h | 646 ----------------- src/turbomind/kernels/gemm_s_f16/format.cu | 144 ---- src/turbomind/kernels/gemm_s_f16/format.h | 32 - .../kernels/gemm_s_f16/gemm_s4_f16.cu | 309 -------- .../kernels/gemm_s_f16/gemm_s4_f16.h | 59 -- .../kernels/gemm_s_f16/gemm_s4_f16_kernel.h | 203 ------ .../kernels/gemm_s_f16/gemm_template.h | 391 ---------- src/turbomind/kernels/gemm_s_f16/metric.h | 112 --- .../kernels/gemm_s_f16/warp_iterator.h | 166 ----- .../kernels/unfused_attention_kernels.cu | 2 +- src/turbomind/models/llama/CMakeLists.txt | 3 +- src/turbomind/models/llama/LlamaBatch.cc | 4 +- .../models/llama/LlamaDecoderLayerWeight.cc | 336 +++++++-- .../models/llama/LlamaDecoderLayerWeight.h | 4 + src/turbomind/models/llama/LlamaDenseWeight.h | 9 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 67 +- src/turbomind/models/llama/LlamaFfnLayer.h | 20 +- src/turbomind/models/llama/LlamaLinear.cu | 221 ++++++ src/turbomind/models/llama/LlamaLinear.h | 122 +--- src/turbomind/models/llama/LlamaV2.cc | 109 ++- src/turbomind/models/llama/LlamaV2.h | 4 + src/turbomind/models/llama/LlamaWeight.cc | 19 + src/turbomind/models/llama/LlamaWeight.h | 2 + src/turbomind/models/llama/llama_kernels.cu | 3 +- src/turbomind/models/llama/llama_utils.cu | 2 +- .../models/llama/unified_attention_layer.cc | 33 +- 
.../models/llama/unified_attention_layer.h | 12 +- src/turbomind/models/llama/unified_decoder.cc | 4 +- src/turbomind/models/llama/unified_decoder.h | 12 +- src/turbomind/python/bind.cpp | 76 +- .../triton_backend/llama/LlamaTritonModel.cc | 58 +- .../triton_backend/llama/LlamaTritonModel.h | 7 + .../transformer_triton_backend.hpp | 7 + src/turbomind/utils/CMakeLists.txt | 3 + src/turbomind/utils/cuda_utils.cc | 7 + src/turbomind/utils/cuda_utils.h | 2 + src/turbomind/utils/parser.cc | 39 + src/turbomind/utils/parser.h | 30 + .../test_turbomind/test_converter.py | 61 +- 189 files changed, 12977 insertions(+), 4983 deletions(-) create mode 100644 lmdeploy/turbomind/deploy/exporter.py create mode 100644 lmdeploy/turbomind/deploy/policy.py delete mode 100644 lmdeploy/turbomind/deploy/source_model/baichuan_awq.py delete mode 100644 lmdeploy/turbomind/deploy/source_model/deepseek_vl_awq.py delete mode 100644 lmdeploy/turbomind/deploy/source_model/glm4_awq.py delete mode 100644 lmdeploy/turbomind/deploy/source_model/llama_awq.py delete mode 100644 lmdeploy/turbomind/deploy/source_model/minicpmv_awq.py delete mode 100644 lmdeploy/turbomind/deploy/source_model/qwen_awq.py delete mode 100644 lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py delete mode 100644 lmdeploy/turbomind/deploy/target_model/plora.py delete mode 100644 lmdeploy/turbomind/deploy/target_model/plora_w4.py delete mode 100644 lmdeploy/turbomind/deploy/target_model/w4.py create mode 100644 src/turbomind/kernels/attention/rotary_embedding.h create mode 100644 src/turbomind/kernels/core/array.h rename src/turbomind/kernels/{attention => core}/array_ops.h (55%) create mode 100644 src/turbomind/kernels/core/common.h rename src/turbomind/kernels/{attention => core}/data_type.h (59%) rename src/turbomind/kernels/{attention/smem_layout.h => core/layout.h} (80%) create mode 100644 src/turbomind/kernels/core/math.h create mode 100644 src/turbomind/kernels/core/meta.h create mode 100644 src/turbomind/kernels/core/mma.h create mode 100644 src/turbomind/kernels/core/pipe_iter.h create mode 100644 src/turbomind/kernels/core/smem.h create mode 100644 src/turbomind/kernels/core/sub_byte_ptr.h create mode 100644 src/turbomind/kernels/core/sync.h rename src/turbomind/kernels/{attention => core}/thread_map.h (99%) create mode 100644 src/turbomind/kernels/gemm/CMakeLists.txt create mode 100644 src/turbomind/kernels/gemm/arch.h create mode 100644 src/turbomind/kernels/gemm/arch/config_simt.h create mode 100644 src/turbomind/kernels/gemm/arch/config_sm70_s884.h create mode 100644 src/turbomind/kernels/gemm/arch/config_sm75_s16816.h create mode 100644 src/turbomind/kernels/gemm/arch/config_sm80_s16816.h create mode 100644 src/turbomind/kernels/gemm/arch/mma_simt.h create mode 100644 src/turbomind/kernels/gemm/arch/mma_sm70.h create mode 100644 src/turbomind/kernels/gemm/arch/mma_sm80.h create mode 100644 src/turbomind/kernels/gemm/arch/operand_simt.h create mode 100644 src/turbomind/kernels/gemm/arch/operand_sm70_s884.h create mode 100644 src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h create mode 100644 src/turbomind/kernels/gemm/arch/smem_copy_simt.h create mode 100644 src/turbomind/kernels/gemm/arch/smem_copy_sm70.h create mode 100644 src/turbomind/kernels/gemm/arch/smem_copy_sm80.h create mode 100644 src/turbomind/kernels/gemm/cast.cu create mode 100644 src/turbomind/kernels/gemm/cast.h create mode 100644 src/turbomind/kernels/gemm/convert_v2.cu create mode 100644 src/turbomind/kernels/gemm/convert_v2.h create mode 100644 
src/turbomind/kernels/gemm/cp_async.h create mode 100644 src/turbomind/kernels/gemm/cta_map.h create mode 100644 src/turbomind/kernels/gemm/desc.h create mode 100644 src/turbomind/kernels/gemm/dispatch_cache.cu create mode 100644 src/turbomind/kernels/gemm/dispatch_cache.h create mode 100644 src/turbomind/kernels/gemm/epilogue.h create mode 100644 src/turbomind/kernels/gemm/format.h create mode 100644 src/turbomind/kernels/gemm/gemm.cu create mode 100644 src/turbomind/kernels/gemm/gemm.h create mode 100644 src/turbomind/kernels/gemm/gemm_universal.h create mode 100644 src/turbomind/kernels/gemm/gpu_metric.cu create mode 100644 src/turbomind/kernels/gemm/gpu_metric.h create mode 100644 src/turbomind/kernels/gemm/iterator.h create mode 100644 src/turbomind/kernels/gemm/iterator_sm70.h create mode 100644 src/turbomind/kernels/gemm/iterator_sm80.h create mode 100644 src/turbomind/kernels/gemm/kernel.cu create mode 100644 src/turbomind/kernels/gemm/kernel.h create mode 100644 src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu create mode 100644 src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu create mode 100644 src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu create mode 100644 src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu create mode 100644 src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu create mode 100644 src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu create mode 100644 src/turbomind/kernels/gemm/kernel_impl.h create mode 100644 src/turbomind/kernels/gemm/mainloop_sm70.h create mode 100644 src/turbomind/kernels/gemm/mainloop_sm80_v2.h create mode 100644 src/turbomind/kernels/gemm/operand.h create mode 100644 src/turbomind/kernels/gemm/predicate.h create mode 100644 src/turbomind/kernels/gemm/registry.cu create mode 100644 src/turbomind/kernels/gemm/registry.h create mode 100644 src/turbomind/kernels/gemm/simt.h create mode 100644 src/turbomind/kernels/gemm/smem_copy.h create mode 100644 src/turbomind/kernels/gemm/test/gemm_bench.cu create mode 100644 src/turbomind/kernels/gemm/test/gemm_test.cu create mode 100644 src/turbomind/kernels/gemm/test/quantization.cu create mode 100644 src/turbomind/kernels/gemm/test/quantization.h create mode 100644 src/turbomind/kernels/gemm/test/quantization_impl.h create mode 100644 src/turbomind/kernels/gemm/test/reference.cu create mode 100644 src/turbomind/kernels/gemm/test/reference.h create mode 100644 src/turbomind/kernels/gemm/test/test_utils.cu create mode 100644 src/turbomind/kernels/gemm/test/test_utils.h create mode 100644 src/turbomind/kernels/gemm/test/testbed.h create mode 100644 src/turbomind/kernels/gemm/thread_group_map.h create mode 100644 src/turbomind/kernels/gemm/thread_map.h create mode 100644 src/turbomind/kernels/gemm/tiled_mma.h create mode 100644 src/turbomind/kernels/gemm/transform.h create mode 100644 src/turbomind/kernels/gemm/tuner/cache_utils.cu create mode 100644 src/turbomind/kernels/gemm/tuner/cache_utils.h create mode 100644 src/turbomind/kernels/gemm/tuner/measurer.cu create mode 100644 src/turbomind/kernels/gemm/tuner/measurer.h create mode 100644 src/turbomind/kernels/gemm/tuner/params.cc create mode 100644 src/turbomind/kernels/gemm/tuner/params.h create mode 100644 src/turbomind/kernels/gemm/tuner/sampler.cu create mode 100644 src/turbomind/kernels/gemm/tuner/sampler.h create mode 100644 src/turbomind/kernels/gemm/tuner/stats.h create mode 100644 src/turbomind/kernels/gemm/tuner/stopping_criterion.cc create 
mode 100644 src/turbomind/kernels/gemm/tuner/stopping_criterion.h create mode 100644 src/turbomind/kernels/gemm/types.h create mode 100644 src/turbomind/kernels/gemm/unpack.cu create mode 100644 src/turbomind/kernels/gemm/utils.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/CMakeLists.txt delete mode 100644 src/turbomind/kernels/gemm_s_f16/common.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/cta_iterator.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/format.cu delete mode 100644 src/turbomind/kernels/gemm_s_f16/format.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.cu delete mode 100644 src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/gemm_s4_f16_kernel.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/gemm_template.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/metric.h delete mode 100644 src/turbomind/kernels/gemm_s_f16/warp_iterator.h create mode 100644 src/turbomind/models/llama/LlamaLinear.cu create mode 100644 src/turbomind/utils/parser.cc create mode 100644 src/turbomind/utils/parser.h diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index fdad879588..49efb8d8f7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -43,7 +43,7 @@ jobs: - name: Check docstring coverage run: | python -m pip install interrogate - interrogate -v --exclude ./lmdeploy/pytorch_poc/modeling/ --ignore-init-method --ignore-magic --ignore-module --ignore-private --ignore-nested-functions --ignore-nested-classes --fail-under 80 lmdeploy + interrogate -v --exclude ./lmdeploy/pytorch_poc/modeling/ --ignore-init-method --ignore-magic --ignore-module --ignore-private --ignore-nested-functions --ignore-nested-classes --fail-under 70 lmdeploy - name: Check pylint score run: | python -m pip install pylint diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a280bf0133..c6c0a45bf3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,7 @@ repos: rev: v2.1.0 hooks: - id: codespell - args: ["--skip=third_party/*,*.ipynb,*.proto"] + args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"] - repo: https://github.com/myint/docformatter rev: v1.4 diff --git a/CMakeLists.txt b/CMakeLists.txt index 173a689f47..1d5abdad22 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,6 +143,9 @@ if (NOT CMAKE_CUDA_ARCHITECTURES) if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8") list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-real) endif () + if (MSVC) + list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES 80-real 90-real) + endif () endif () message(STATUS "Building with CUDA archs: ${CMAKE_CUDA_ARCHITECTURES}") @@ -322,7 +325,6 @@ add_library(transformer-shared SHARED $ $ $ - $ $ $ $ diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 00eb7b8cf7..37a3ef7f0b 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -106,7 +106,7 @@ def model_format(parser, default: str = None): '--model-format', type=str, default=default, - choices=['hf', 'llama', 'awq'], + choices=['hf', 'llama', 'awq', 'gptq'], help='The format of input model. 
`hf` meaning `hf_llama`, `llama` ' 'meaning `meta_llama`, `awq` meaning the quantized model by awq') diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 28012a7263..60c93c9047 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -13,10 +13,12 @@ from ...utils import _get_and_verify_max_len from ..supported_models import SUPPORTED_ARCHS, is_supported +from .exporter import get_exporter_factory +from .policy import get_input_policy from .source_model.base import INPUT_MODELS from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig -SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', None] +SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', 'gptq', None] logger = get_logger('lmdeploy') @@ -31,8 +33,6 @@ def get_input_model_registered_name(model_path: str, model_format: str): """ arch = get_model_arch(model_path)[0] register_name = SUPPORTED_ARCHS[arch] - if model_format == 'awq': - register_name = register_name + '-awq' return register_name @@ -96,7 +96,7 @@ def get_output_model_registered_name_and_config(model_path: str, ['meta_llama', 'hf', 'awq'] group_size (int): the size of group used by awq model """ - register_name = 'fp16' + register_name = 'tm' turbomind_model_arch = 'llama' weight_type = 'fp16' @@ -105,14 +105,11 @@ def get_output_model_registered_name_and_config(model_path: str, if model_format == 'meta_llama': session_len = 2048 else: # hf, awq, None - register_name = 'fp16' model_arch, model_config = get_model_arch(model_path) turbomind_model_arch = SUPPORTED_ARCHS[model_arch] session_len = _get_and_verify_max_len(model_config, None) - if model_format == 'awq': + if model_format in ['awq', 'gptq']: weight_type = 'int4' - register_name = 'plora-w4' \ - if turbomind_model_arch == 'xcomposer2' else 'w4' group_size = 128 if group_size == 0 else group_size else: torch_dtype = getattr(model_config, 'torch_dtype', 'float16') @@ -127,16 +124,16 @@ def get_output_model_registered_name_and_config(model_path: str, 'Device does not support bfloat16. 
Set float16 forcefully') weight_type = 'fp16' - register_name = weight_type - if turbomind_model_arch == 'xcomposer2': - register_name = 'plora' - config.model_arch = model_arch config.session_len = session_len + 8 config.weight_type = weight_type config.group_size = group_size - return register_name, config + lora_type = 'plora' if turbomind_model_arch == 'xcomposer2' else '' + + exporter_factory = get_exporter_factory(weight_type, lora_type) + + return register_name, config, exporter_factory def pack_model_repository(workspace_path: str): @@ -178,20 +175,26 @@ def get_tm_model(model_path, input_model_name = get_input_model_registered_name( model_path, engine_config.model_format) + input_policy = get_input_policy(engine_config.model_format) input_model = INPUT_MODELS.get(input_model_name)(model_path=model_path, - tokenizer_path=model_path) + tokenizer_path=model_path, + input_policy=input_policy) - output_model_name, cfg = get_output_model_registered_name_and_config( - model_path=model_path, - model_format=engine_config.model_format, - group_size=group_size) + output_model_name, cfg, exporter_factory = \ + get_output_model_registered_name_and_config( + model_path=model_path, + model_format=engine_config.model_format, + group_size=group_size) cfg.chat_template = chat_template_name cfg.model_name = model_name cfg.update_from_engine_config(engine_config) output_model = OUTPUT_MODELS.get(output_model_name)( - input_model=input_model, cfg=cfg, out_dir=out_dir) + input_model=input_model, + cfg=cfg, + exporter_factory=exporter_factory, + out_dir=out_dir) return output_model diff --git a/lmdeploy/turbomind/deploy/exporter.py b/lmdeploy/turbomind/deploy/exporter.py new file mode 100644 index 0000000000..48f9312fa6 --- /dev/null +++ b/lmdeploy/turbomind/deploy/exporter.py @@ -0,0 +1,211 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
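+#
+# This module collects the save-time transforms shared by TurboMind output
+# models:
+#   - permute_v2 / merge_qkv_v2 rearrange the q/k head layout and fuse q, k, v
+#     into a single tensor laid out per tensor-parallel rank.
+#   - pack_u4_row packs eight 4-bit values (stored as uint8) into one int32;
+#     pad_out_dims / pad_in_dims pad FFN weights to the configured inter_size.
+#   - LayerNormExporter, WeightExporter, QuantWeightExporter and PLoraExporter
+#     decide which tensors of a layer are written (norms, plain weights, int4
+#     qweight/scales/zeros, or PLoRA adapters) and how they are split across
+#     ranks via save_split.
+#   - get_exporter_factory assembles the exporter list from the weight type
+#     ('int4' vs. floating point) and the lora type ('plora').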
+from abc import ABC, abstractmethod + +import torch + +from .target_model.base import BaseOutputModel, BaseReader + + +def permute_v2(x: torch.Tensor, size_per_head: int = 128): + """ + Contract: x.size(-1) is output dims + """ + + assert x.size(-1) > 1 + + output_dims = x.size(-1) + head_num = output_dims // size_per_head + + return x.view(-1, head_num, 2, + size_per_head // 2).transpose(2, 3).reshape(x.shape) + + +def merge_qkv_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int): + """ + Contract: x.size(-1) is output dims + """ + + def reshape(x): + return x.view(x.size(0), tp, -1) if q.dim() == 2 else x.view(tp, -1) + + qkv = torch.cat(tuple(map(reshape, (q, k, v))), dim=-1) + + qkv = qkv.view(-1, qkv.size(-1) * tp) + if q.dim() == 1: + qkv.squeeze_() + + return qkv + + +def identity(x): + return x + + +def transpose(x): + return x.t() if x is not None else x + + +def pack_u4_row(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.uint8 + xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) + a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) + for t in reversed(xs): + a = (a << 4) | t + return a.squeeze(dim=-1) + + +def pad_out_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(-1) + assert pad >= 0 + return torch.nn.functional.pad(x, (0, pad), 'constant', 0) + + +def pad_in_dims(x: torch.Tensor, dims: int): + pad = dims - x.size(0) + assert x.dim() == 2 + assert pad >= 0 + return torch.nn.functional.pad(x, (0, 0, 0, pad), 'constant', 0) + + +class BaseExporter(ABC): + + _attn = 'layers.{0}.attention.{1}.{2}' + _ffn = 'layers.{0}.feed_forward.{1}.{2}' + + def __init__(self, model: BaseOutputModel): + self.model = model + self.tp = model.cfg.tensor_para_size + self.head_dim = model.cfg.size_per_head + self.inter_size = model.cfg.inter_size + + def export_attn(self, idx: int, qkvo, kind: str, pack_fn=identity): + if all(x is None for x in qkvo): + return + is_lora_a, is_lora_b = self.get_lora_flags(kind) + q, k, v, o = map(transpose, qkvo) + if self.model.permute_qk: + q = permute_v2(q, self.head_dim) + k = permute_v2(k, self.head_dim) + qkv = merge_qkv_v2(q, k, v, self.tp) + if o is None and q.dim() == 1: + o = torch.zeros_like(q) + qkv = pack_fn(qkv) + o = pack_fn(o) + self.model.save_split(qkv, + self._attn.format(idx, 'w_qkv', kind), + split_dim=-1, + copy=is_lora_a) + self.model.save_split(o, + self._attn.format(idx, 'wo', kind), + split_dim=0, + copy=is_lora_b) + + def export_ffn(self, idx: int, w123, kind: str, pack_fn=identity, g=1): + is_lora_a, is_lora_b = self.get_lora_flags(kind) + w1, w2, w3 = map(transpose, w123) + + if not is_lora_a: + w1 = pad_out_dims(w1, self.inter_size) + w3 = pad_out_dims(w3, self.inter_size) + if not is_lora_b: + w2 = pad_in_dims(w2, self.inter_size // g) + + w1, w2, w3 = map(pack_fn, (w1, w2, w3)) + self.model.save_split(w1, + self._ffn.format(idx, 'w1', kind), + split_dim=-1, + copy=is_lora_a) + self.model.save_split(w3, + self._ffn.format(idx, 'w3', kind), + split_dim=-1, + copy=is_lora_a) + self.model.save_split(w2, + self._ffn.format(idx, 'w2', kind), + split_dim=0, + copy=is_lora_b) + + # split out dims -> copy A, split-out-dims B (qkv, w1, w3) + # split in dims -> split-in-dims A, copy B ( o, w2) + def get_lora_flags(self, kind: str): + return ('lora_a' in kind, 'lora_b' in kind) + + @abstractmethod + def export(self, r: BaseReader, idx: int): + pass + + +class WeightExporter(BaseExporter): + + def export(self, r: BaseReader, i: int): + self.export_attn(i, r.attn(i), 'weight') + self.export_attn(i, 
r.attn_bias(i), 'bias') + self.export_ffn(i, r.ffn(i), 'weight') + + +class LayerNormExporter(BaseExporter): + + def export(self, r: BaseReader, i: int): + attn_norm = r.attn_norm(i) + ffn_norm = r.ffn_norm(i) + self.model.save_split(attn_norm, f'layers.{i}.attention_norm.weight') + self.model.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') + + +class QuantWeightExporter(BaseExporter): + + def __init__(self, model: BaseOutputModel, pack_fn): + super().__init__(model) + self.pack_fn = pack_fn + self.group_size = model.cfg.group_size + + def export(self, r: BaseReader, i: int): + + def to_half(x: torch.Tensor): + return x.to(torch.half) + + self.export_attn(i, r.attn(i), 'qweight', self.pack_fn) + self.export_attn(i, r.attn_bias(i), 'bias', to_half) + self.export_attn(i, r.attn_scale(i), 'scales', to_half) + self.export_attn(i, r.attn_zero(i), 'zeros', to_half) + self.export_ffn(i, r.ffn(i), 'qweight', self.pack_fn) + self.export_ffn(i, r.ffn_scale(i), 'scales', to_half, self.group_size) + self.export_ffn(i, r.ffn_zero(i), 'zeros', to_half, self.group_size) + + +class PLoraExporter(BaseExporter): + + def export_attn_lora_a(self, idx: int, ws, kind: str): + is_lora_a, is_lora_b = self.get_lora_flags(kind) + qkv, o = map(transpose, ws) + self.model.save_split(qkv, + self._attn.format(idx, 'w_qkv', kind), + split_dim=-1, + copy=is_lora_a) + self.model.save_split(o, + self._attn.format(idx, 'wo', kind), + split_dim=0, + copy=is_lora_b) + + def export(self, r: BaseReader, i: int): + self.export_attn_lora_a(i, r.attn_lora_a(i), 'lora_a.weight') + self.export_attn(i, r.attn_lora_b(i), 'lora_b.weight') + self.export_ffn(i, r.ffn_lora_a(i), 'lora_a.weight') + self.export_ffn(i, r.ffn_lora_b(i), 'lora_b.weight') + + +def get_exporter_factory(weight_type, lora_type): + + def get_exporters(model: BaseOutputModel): + exporters = [LayerNormExporter(model)] + + if weight_type == 'int4': + exporters.append(QuantWeightExporter(model, pack_u4_row)) + else: + exporters.append(WeightExporter(model)) + + if lora_type == 'plora': + exporters.append(PLoraExporter(model)) + + return exporters + + return get_exporters diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py new file mode 100644 index 0000000000..bc4db7ddae --- /dev/null +++ b/lmdeploy/turbomind/deploy/policy.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
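+#
+# This module defines how raw checkpoint tensors are pre-processed before
+# export:
+#   - get_u4_slices splits an int32-packed tensor into eight 4-bit nibbles.
+#   - unpack_awq_gemm additionally undoes AWQ's interleaved column order
+#     (0, 4, 1, 5, 2, 6, 3, 7).
+#   - process_awq_gemm / process_gptq move tensors to GPU, unpack int32-packed
+#     data and transpose qweight, qzeros and scales (GPTQ zero points are
+#     offset by +1).
+#   - get_input_policy maps the model format ('awq', 'gptq' or default) to the
+#     weight key and the matching processing function.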
+from typing import List + +import torch.cuda + + +def to_cuda(x: torch.Tensor, *args): + return x.cuda() + + +def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: + assert x.dtype == torch.int32 + xs = [] + for _ in range(8): + xs.append((x & 15).to(dtype)) + x = x >> 4 + return xs + + +def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: + xs = get_u4_slices(x, torch.uint8) + order = [0, 4, 1, 5, 2, 6, 3, 7] + ys = [xs[i] for i in order] + return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) + + +def process_awq_gemm(x: torch.Tensor, kind: str): + x = x.cuda() + if x.dtype == torch.int32: + x = unpack_awq_gemm(x) + if kind in ['qweight', 'qzeros', 'scales']: + x = x.t() + return x + + +def process_gptq(x: torch.Tensor, kind: str): + x = x.cuda() + if x.dtype == torch.int32: + xs = get_u4_slices(x, torch.uint8) + if kind == 'qweight': # (k/8,n) + x = torch.stack(xs, dim=1).view(-1, x.size(-1)) + else: # 'qzeros' (k/g,n/8) + x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 + if kind in ['qweight', 'qzeros', 'scales']: + x = x.t() + return x + + +def get_input_policy(model_format): + if model_format == 'awq': + return ('qweight', process_awq_gemm) + elif model_format == 'gptq': + return ('qweight', process_gptq) + else: + return ('weight', to_cuda) diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py index 5ba4badb5a..9ca06b0cc9 100644 --- a/lmdeploy/turbomind/deploy/source_model/__init__.py +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -1,18 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 -from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401 from .deepseek_vl import DeepSeekVLModel # noqa: F401 -from .deepseek_vl_awq import DeepSeekVLAwqModel # noqa: F401 from .glm4 import Glm4Model # noqa: F401 -from .glm4_awq import Glm4AwqModel # noqa: F401 -from .internlm2 import InternLM2AwqModel, InternLM2Model # noqa: F401 +from .internlm2 import InternLM2Model # noqa: F401 from .internvl import InternVLModel # noqa: F401 from .llama import LlamaModel # noqa: F401 -from .llama_awq import LlamaAwqModel # noqa: F401 from .meta_llama import MetaLlamaModel # noqa: F401 from .minicpmv import MiniCPMVModel # noqa: F401 -from .minicpmv_awq import MiniCPMVAwqModel # noqa: F401 from .qwen import QwenModel # noqa: F401 -from .qwen_awq import QwenAwqModel # noqa: F401 from .xcomposer2 import Xcomposer2Model # noqa: F401 -from .xcomposer2_awq import Xcomposer2AwqModel # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan.py b/lmdeploy/turbomind/deploy/source_model/baichuan.py index d024af070a..51ca34b55a 100644 --- a/lmdeploy/turbomind/deploy/source_model/baichuan.py +++ b/lmdeploy/turbomind/deploy/source_model/baichuan.py @@ -9,36 +9,28 @@ class BaichuanReader(LlamaReader): """BaichuanReader.""" - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" - result = [] + q, k, v, o = (None, ) * 4 pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}' - qkv = self.params[pack_key] - result.extend(torch.split(qkv, qkv.shape[size_dim] // 3, dim=dim)) - o = self.params[f'model.layers.{i}.self_attn.o_proj.{kind}'] - result.append(o) - return (*result, 
) + qkv = self.transform(self.params.get(pack_key), kind) + if qkv is not None: + q, k, v = torch.split(qkv, qkv.shape[0] // 3, dim=0) + o = self.params.get(f'model.layers.{i}.self_attn.o_proj.{kind}') + o = self.transform(o, kind) + return q, k, v, o + - def attn(self, i: int): - """Get q, k, v, o weight for layer i.""" - return self._attn(i, 'weight', 0, 0) +@INPUT_MODELS.register_module(name='baichuan') +class BaichuanModel(LlamaModel): + """Llama model in baichuan format.""" - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer i.""" - return (None, ) * 4 + Reader = BaichuanReader class Baichuan2Reader(BaichuanReader): """Baichuan2Reader.""" - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - def output_weight(self): """Get output.""" # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507 @@ -49,21 +41,8 @@ def output_weight(self): return tensor -@INPUT_MODELS.register_module(name='baichuan') -class BaichuanModel(LlamaModel): - """Llama model in baichuan format.""" - - Reader = BaichuanReader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): - super().__init__(model_path, tokenizer_path, **kwargs) - - @INPUT_MODELS.register_module(name='baichuan2') class Baichuan2Model(LlamaModel): """Llama model in baichuan format.""" Reader = Baichuan2Reader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): - super().__init__(model_path, tokenizer_path, **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py b/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py deleted file mode 100644 index 36d3b90107..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch - -from .baichuan import Baichuan2Model, BaichuanModel, BaichuanReader -from .base import INPUT_MODELS -from .llama_awq import ensure_fp16orint32 - - -class BaichuanAwqReader(BaichuanReader): - """BaichuanAwqReader.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def attn(self, i: int): - """Get q, k, v, o qweight for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1)) - - def attn_zero(self, i: int): - """Get q, k, v, o qzeros for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1)) - - def attn_scale(self, i: int): - """Get q, k, v, o scales for layer i.""" - return ensure_fp16orint32(self._attn(i, 'scales', -1, -1)) - - def ffn(self, i: int): - """Get ffn qweight for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qweight')) - - def ffn_zero(self, i: int): - """Get ffn qzeros for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qzeros')) - - def ffn_scale(self, i: int): - """Get ffn scales for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'scales')) - - -class Baichuan2AwqReader(BaichuanAwqReader): - """Baichuan2AwqReader.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def output_weight(self): - """Get output.""" - # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507 - tensor = self.params.get('lm_head.weight', None) - if tensor is not None: - tensor = tensor.cuda() - tensor = torch.nn.functional.normalize(tensor) - return tensor - - -@INPUT_MODELS.register_module(name='baichuan-awq') -class BaichuanAwqModel(BaichuanModel): - """Baichuan awq model in hf format.""" - - Reader = BaichuanAwqReader - - def __init__(self, - model_path: str, - tokenizer_path: str, - ckpt_path: str = None, - **kwargs): - super().__init__(model_path, - tokenizer_path, - ckpt_path=ckpt_path, - **kwargs) - - -@INPUT_MODELS.register_module(name='baichuan2-awq') -class Baichuan2AwqModel(Baichuan2Model): - """Baichuan2 awq model in hf format.""" - - Reader = Baichuan2AwqReader - - def __init__(self, - model_path: str, - tokenizer_path: str, - ckpt_path: str = None, - **kwargs): - super().__init__(model_path, - tokenizer_path, - ckpt_path=ckpt_path, - **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py index c335b4c10b..c0e468d645 100644 --- a/lmdeploy/turbomind/deploy/source_model/base.py +++ b/lmdeploy/turbomind/deploy/source_model/base.py @@ -74,6 +74,15 @@ def clean_up(self, last: bool) -> None: self.params.pop(key, None) torch.cuda.empty_cache() + def transform(self, x: Union[torch.Tensor, None], + kind: str) -> Union[torch.Tensor, None]: + return None if x is None else self._transform(x, kind) + + @abstractmethod + def _transform(self, x: torch.Tensor, kind: str): + """Transform x.""" + pass + @abstractmethod def tok_embeddings(self) -> Union[torch.Tensor, None]: """Get embeddings.""" diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py index 74c5e799af..f17a3398b3 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py @@ -9,99 +9,23 @@ class DeepSeekVLReader(LlamaReader): """DeepSeekVL model reader.""" + attn_layer_prefix = 
'language_model.model.layers' attn_layer_patten = r'language_model.model.layers.([0-9]+).' tok_embeddings_key = 'language_model.model.embed_tokens.weight' norm_weight_key = 'language_model.model.norm.weight' output_weight_key = 'language_model.lm_head.weight' def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): + model_cfg: dict, **kwargs): model_cfg = model_cfg['language_config'] - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def init_layer_id(self): - """Get start/end transformer layer id.""" - super().init_layer_id() - - def clean_up(self, last: bool) -> None: - """Clean up unused params.""" - super().clean_up(last) - - @property - def start_layer_id(self): - """Get start transformer layer id.""" - return self._start_layer_id - - @property - def end_layer_id(self): - """Get end transformer layer id.""" - return self._end_layer_id - - def tok_embeddings(self): - """Get embeddings.""" - return self.params.get(self.tok_embeddings_key, None) - - def norm_weight(self): - """Get norm.""" - return self.params.get(self.norm_weight_key, None) - - def output_weight(self): - """Get output.""" - return self.params.get(self.output_weight_key, None) - - def _attn(self, i: int, kind: str, allow_none=False): - """Get q, k, v, o kind for layer i.""" - result = [] - for key in ['q', 'k', 'v', 'o']: - tensor = self.params.get( - f'language_model.model.layers.{i}.self_attn.{key}_proj.{kind}') - if not allow_none: - assert tensor is not None - result.append(tensor) - return (*result, ) - - def attn(self, i: int): - """Get q, k, v, o weight for layer i.""" - return self._attn(i, 'weight') - - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer i.""" - return self._attn(i, 'bias', allow_none=True) - - def attn_zero(self, i: int): - """Get q, k, v, o zero point for layer i.""" - return (None, ) * 4 - - def attn_scale(self, i: int): - """Get q, k, v, o scale for layer i.""" - return (None, ) * 4 + super().__init__(new_params, unused_params, last_bin, model_cfg, + **kwargs) def attn_norm(self, i: int): """Get attn norm for layer i.""" return self.params[ f'language_model.model.layers.{i}.input_layernorm.weight'] - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - result = [] - for key in ['gate', 'down', 'up']: - tensor = self.params[ - f'language_model.model.layers.{i}.mlp.{key}_proj.{kind}'] - result.append(tensor) - return (*result, ) - - def ffn(self, i: int): - """Get ffn weight for layer i.""" - return self._ffn(i, 'weight') - - def ffn_zero(self, i: int): - """Get ffn zero point for layer i.""" - return (None, ) * 3 - - def ffn_scale(self, i: int): - """Get ffn scale for layer i.""" - return (None, ) * 3 - def ffn_norm(self, i: int): """Get ffn norm for layer i.""" return self.params[ @@ -114,9 +38,6 @@ class DeepSeekVLModel(LlamaModel): Reader = DeepSeekVLReader - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - def model_info(self): """Read model info.""" params_path = osp.join(self.model_path, 'config.json') diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek_vl_awq.py b/lmdeploy/turbomind/deploy/source_model/deepseek_vl_awq.py deleted file mode 100644 index 5da7ccabed..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/deepseek_vl_awq.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .base import INPUT_MODELS -from .deepseek_vl import DeepSeekVLModel, DeepSeekVLReader -from .llama_awq import ensure_fp16orint32 - - -class DeepSeekVLAwqReader(DeepSeekVLReader): - """LlamaAwqReader.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def attn(self, i: int): - """Get q, k, v, o qweight for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qweight')) - - def attn_zero(self, i: int): - """Get q, k, v, o qzeros for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qzeros')) - - def attn_scale(self, i: int): - """Get q, k, v, o scales for layer i.""" - return ensure_fp16orint32(self._attn(i, 'scales')) - - def ffn(self, i: int): - """Get ffn qweight for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qweight')) - - def ffn_zero(self, i: int): - """Get ffn qzeros for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qzeros')) - - def ffn_scale(self, i: int): - """Get ffn scales for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'scales')) - - -@INPUT_MODELS.register_module(name='deepseekvl-awq') -class DeepSeekVLAwqModel(DeepSeekVLModel): - """Llama Awq model in hf format.""" - - Reader = DeepSeekVLAwqReader - - def __init__(self, - model_path: str, - tokenizer_path: str, - ckpt_path: str = None, - **kwargs): - super().__init__(model_path, - tokenizer_path, - ckpt_path=ckpt_path, - **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/glm4.py b/lmdeploy/turbomind/deploy/source_model/glm4.py index 5fc8eb0257..2c69d5d0da 100644 --- a/lmdeploy/turbomind/deploy/source_model/glm4.py +++ b/lmdeploy/turbomind/deploy/source_model/glm4.py @@ -17,14 +17,11 @@ class Glm4Reader(LlamaReader): norm_weight_key = 'transformer.encoder.final_layernorm.weight' output_weight_key = 'transformer.output_layer.weight' - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" qkv = self.params[f'transformer.encoder.layers.{i}' f'.self_attention.query_key_value.{kind}'] + qkv = self.transform(qkv, kind) attn_head_num = self.model_cfg['num_attention_heads'] kv_head_num = attn_head_num if self.model_cfg.get('multi_query_attention', False): @@ -34,30 +31,14 @@ def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): attn_head_num * HEAD_DIM, kv_head_num * HEAD_DIM, kv_head_num * HEAD_DIM ], - dim=size_dim) + dim=0) o = self.params.get( - f'transformer.encoder.layers.{i}.self_attention.dense.{kind}', - None) + f'transformer.encoder.layers.{i}.self_attention.dense.{kind}') + o = self.transform(o, kind) if o is None: # handle the case when qkv has bias but o doesn't o = torch.zeros_like(q) return q, k, v, o - def attn(self, i: int): - """Get q, k, v, o weight for layer i.""" - return self._attn(i, 'weight', 0, 0) - - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer i.""" - return self._attn(i, 'bias', -1, 0) - - def attn_zero(self, i: int): - """Get q, k, v, o zero point for layer i.""" - return (None, ) * 4 - - def attn_scale(self, i: int): - """Get q, k, v, o scale for layer i.""" - return (None, ) * 4 - def attn_norm(self, i: int): """Get attn norm for layer i.""" return self.params[ @@ -67,24 +48,13 @@ def _ffn(self, i: int, kind: str): """Get ffn kind for layer i.""" up_and_gate 
= self.params[ f'transformer.encoder.layers.{i}.mlp.dense_h_to_4h.{kind}'] + up_and_gate = self.transform(up_and_gate, kind) up, gate = up_and_gate.chunk(2, dim=0) down = self.params[ f'transformer.encoder.layers.{i}.mlp.dense_4h_to_h.{kind}'] - + down = self.transform(down, kind) return (up, down, gate) - def ffn(self, i: int): - """Get ffn weight for layer i.""" - return self._ffn(i, 'weight') - - def ffn_zero(self, i: int): - """Get ffn zero point for layer i.""" - return (None, ) * 3 - - def ffn_scale(self, i: int): - """Get ffn scale for layer i.""" - return (None, ) * 3 - def ffn_norm(self, i: int): """Get ffn norm for layer i.""" return self.params[ diff --git a/lmdeploy/turbomind/deploy/source_model/glm4_awq.py b/lmdeploy/turbomind/deploy/source_model/glm4_awq.py deleted file mode 100644 index f27a5f6ce5..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/glm4_awq.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp - -import torch - -from .base import INPUT_MODELS -from .glm4 import Glm4Model, Glm4Reader - - -class Glm4AwqReader(Glm4Reader): - """Glm4AwqReader.""" - - attn_layer_patten = r'transformer.encoder.layers.([0-9]+).' - tok_embeddings_key = 'transformer.embedding.word_embeddings.weight' - norm_weight_key = 'transformer.encoder.final_layernorm.weight' - output_weight_key = 'transformer.output_layer.weight' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): - """Get q, k, v, o kind for layer i.""" - qkv = self.params[f'transformer.encoder.layers.{i}' - f'.self_attention.query_key_value.{kind}'] - attn_head_num = self.model_cfg['num_attention_heads'] - kv_head_num = attn_head_num - if self.model_cfg.get('multi_query_attention', False): - kv_head_num = self.model_cfg['multi_query_group_num'] - HEAD_DIM = int(qkv.shape[size_dim] / (attn_head_num + kv_head_num * 2)) - q, k, v = torch.split(qkv, [ - attn_head_num * HEAD_DIM, kv_head_num * HEAD_DIM, - kv_head_num * HEAD_DIM - ], - dim=size_dim) - o = self.params.get( - f'transformer.encoder.layers.{i}.self_attention.dense.{kind}', - None) - if o is None: # handle the case when qkv has bias but o doesn't - o = torch.zeros_like(q) - return q, k, v, o - - def attn(self, i: int): - """Get q, k, v, o qweight for layer i.""" - return self._attn(i, 'qweight', -1, -1) - - def attn_zero(self, i: int): - """Get q, k, v, o qzeros for layer i.""" - return self._attn(i, 'qzeros', -1, -1) - - def attn_scale(self, i: int): - """Get q, k, v, o scales for layer i.""" - return self._attn(i, 'scales', -1, -1) - - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer i.""" - return self._attn(i, 'bias', -1, 0) - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - up_and_gate = self.params[ - f'transformer.encoder.layers.{i}.mlp.dense_h_to_4h.{kind}'] - up, gate = up_and_gate.chunk(2, dim=-1) - down = self.params[ - f'transformer.encoder.layers.{i}.mlp.dense_4h_to_h.{kind}'] - - return (up, down, gate) - - def ffn(self, i: int): - """Get ffn weight for layer i.""" - return self._ffn(i, 'qweight') - - def ffn_zero(self, i: int): - """Get ffn zero point for layer i.""" - return self._ffn(i, 'qzeros') - - def ffn_scale(self, i: int): - """Get ffn scale for layer i.""" - return self._ffn(i, 'scales') - - -@INPUT_MODELS.register_module(name='glm4-awq') -class 
Glm4AwqModel(Glm4Model): - """Glm2/3/4 model in hf format.""" - - Reader = Glm4AwqReader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - config_path = osp.join(self.model_path, 'config.json') - with open(config_path) as f: - self.config = json.load(f) diff --git a/lmdeploy/turbomind/deploy/source_model/internlm2.py b/lmdeploy/turbomind/deploy/source_model/internlm2.py index d918954cc0..71f17517b3 100644 --- a/lmdeploy/turbomind/deploy/source_model/internlm2.py +++ b/lmdeploy/turbomind/deploy/source_model/internlm2.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. + import torch from .base import INPUT_MODELS from .llama import LlamaModel, LlamaReader -from .llama_awq import ensure_fp16orint32 class InternLM2Reader(LlamaReader): @@ -15,40 +15,25 @@ class InternLM2Reader(LlamaReader): norm_weight_key = 'model.norm.weight' output_weight_key = 'output.weight' - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" + q, k, v = (None, ) * 3 kv_head_num = self.model_cfg['num_key_value_heads'] gs = int(self.model_cfg['num_attention_heads'] / kv_head_num) - qkv = self.params[ - f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}'] - qkv = qkv.view(kv_head_num, gs + 2, 128, -1) - hidden_dim = qkv.shape[-1] - q, k, v = torch.split(qkv, [gs, 1, 1], dim=1) - q = q.reshape(-1, hidden_dim) - k = k.reshape(-1, hidden_dim) - v = v.reshape(-1, hidden_dim) + qkv = self.params.get( + f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}') + qkv = self.transform(qkv, kind) + if qkv is not None: + qkv = qkv.view(kv_head_num, gs + 2, 128, -1) + hidden_dim = qkv.shape[-1] + q, k, v = torch.split(qkv, [gs, 1, 1], dim=1) + q = q.reshape(-1, hidden_dim) + k = k.reshape(-1, hidden_dim) + v = v.reshape(-1, hidden_dim) o = self.params.get( f'{self.attn_layer_prefix}.{i}.attention.wo.{kind}') - return q, k, v, o - - def attn(self, i: int): - """Get q, k, v, o weight for layer i.""" - return self._attn(i, 'weight', 0, 0) - - def attn_bias(self, i: int): - return (None, ) * 4 - - def attn_zero(self, i: int): - """Get q, k, v, o zero point for layer i.""" - return (None, ) * 4 - - def attn_scale(self, i: int): - """Get q, k, v, o scale for layer i.""" - return (None, ) * 4 + o = self.transform(o, kind) + return (q, k, v, o) def attn_norm(self, i: int): """Get attn norm for layer i.""" @@ -61,21 +46,10 @@ def _ffn(self, i: int, kind: str): for key in ['w1', 'w2', 'w3']: tensor = self.params[ f'{self.attn_layer_prefix}.{i}.feed_forward.{key}.{kind}'] + tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - def ffn(self, i: int): - """Get ffn weight for layer i.""" - return self._ffn(i, 'weight') - - def ffn_zero(self, i: int): - """Get ffn zero point for layer i.""" - return (None, ) * 3 - - def ffn_scale(self, i: int): - """Get ffn scale for layer i.""" - return (None, ) * 3 - def ffn_norm(self, i: int): """Get ffn norm for layer i.""" return self.params[f'{self.attn_layer_prefix}.{i}.ffn_norm.weight'] @@ -86,64 +60,3 @@ class InternLM2Model(LlamaModel): """InternLM2 model in hf format.""" Reader = InternLM2Reader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - - -class 
InternLM2AwqReader(InternLM2Reader): - """read weights from internlm2 awq model.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str): - """Get q, k, v, o qweight for layer i.""" - kv_head_num = self.model_cfg['num_key_value_heads'] - gs = int(self.model_cfg['num_attention_heads'] / kv_head_num) - qkv = self.params[ - f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}'] - hidden_dim = qkv.shape[0] - qkv = qkv.view(hidden_dim, kv_head_num, gs + 2, -1) - q, k, v = torch.split(qkv, [gs, 1, 1], dim=-2) - q = q.reshape(hidden_dim, -1) - k = k.reshape(hidden_dim, -1) - v = v.reshape(hidden_dim, -1) - o = self.params.get( - f'{self.attn_layer_prefix}.{i}.attention.wo.{kind}') - return ensure_fp16orint32((q, k, v, o)) - - def attn(self, i: int): - """Get q, k, v, o qweight for layer i.""" - return self._attn(i, 'qweight') - - def attn_zero(self, i: int): - """Get q, k, v, o qzeros for layer i.""" - return self._attn(i, 'qzeros') - - def attn_scale(self, i: int): - """Get q, k, v, o scales for layer i.""" - return self._attn(i, 'scales') - - def ffn(self, i: int): - """Get ffn qweight for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qweight')) - - def ffn_zero(self, i: int): - """Get ffn qzeros for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qzeros')) - - def ffn_scale(self, i: int): - """Get ffn scales for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'scales')) - - -@INPUT_MODELS.register_module(name='internlm2-awq') -class InternLM2AwqModel(InternLM2Model): - """InternLM2 awq model.""" - - Reader = InternLM2AwqReader - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/internvl.py b/lmdeploy/turbomind/deploy/source_model/internvl.py index ba3b1a7485..d7f446da93 100644 --- a/lmdeploy/turbomind/deploy/source_model/internvl.py +++ b/lmdeploy/turbomind/deploy/source_model/internvl.py @@ -3,10 +3,8 @@ import os.path as osp from .base import INPUT_MODELS -from .internlm2 import InternLM2AwqReader, InternLM2Reader +from .internlm2 import InternLM2Reader from .llama import LlamaModel, LlamaReader -from .llama_awq import LlamaAwqReader -from .qwen import Qwen2Reader class InternVLReader(LlamaReader): @@ -19,11 +17,13 @@ class InternVLReader(LlamaReader): output_weight_key = 'language_model.lm_head.weight' def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): + model_cfg: dict, **kwargs): model_cfg = model_cfg.get('llm_config') - super().__init__(new_params, unused_params, last_bin, model_cfg) + super().__init__(new_params, unused_params, last_bin, model_cfg, + **kwargs) +# Note the subtle difference in keys class InternVL2Reader(InternLM2Reader): """InternVLReader for InternLM2 model.""" @@ -34,24 +34,10 @@ class InternVL2Reader(InternLM2Reader): output_weight_key = 'language_model.output.weight' def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): + model_cfg: dict, **kwargs): model_cfg = model_cfg.get('llm_config') - super().__init__(new_params, unused_params, last_bin, model_cfg) - - -class InternVL2ProReader(Qwen2Reader): - """InternVL2 pro reader.""" - - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model.model.layers.([0-9]+).' 
- tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - model_cfg = model_cfg.get('llm_config') - super().__init__(new_params, unused_params, last_bin, model_cfg) + super().__init__(new_params, unused_params, last_bin, model_cfg, + **kwargs) @INPUT_MODELS.register_module(name='internvl') @@ -65,7 +51,7 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs): arch = config.llm_config.architectures[0] _readers = dict(InternLM2ForCausalLM=InternVL2Reader, LlamaForCausalLM=InternVLReader, - Qwen2ForCausalLM=InternVL2ProReader) + Qwen2ForCausalLM=InternVLReader) self.Reader = _readers[arch] def model_info(self): @@ -100,47 +86,3 @@ def model_info(self): max_position_embeddings=max_position_embeddings, use_dynamic_ntk=use_dynamic_ntk, rope_scaling_factor=scaling_factor) - - -class InternVLAwqReader(LlamaAwqReader): - """InternVLReader for llama model.""" - - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model.model.layers.([0-9]+).' - tok_embeddings_key = 'language_model.model.embed_tokens.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.lm_head.weight' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - model_cfg = model_cfg.get('llm_config') - super().__init__(new_params, unused_params, last_bin, model_cfg) - - -class InternVL2AwqReader(InternLM2AwqReader): - """InternVLReader for InternLM2 model.""" - - attn_layer_prefix = 'language_model.model.layers' - attn_layer_patten = r'language_model.model.layers.([0-9]+).' 
- tok_embeddings_key = 'language_model.model.tok_embeddings.weight' - norm_weight_key = 'language_model.model.norm.weight' - output_weight_key = 'language_model.output.weight' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - model_cfg = model_cfg.get('llm_config') - super().__init__(new_params, unused_params, last_bin, model_cfg) - - -@INPUT_MODELS.register_module(name='internvl-awq') -class InternVLAwqModel(InternVLModel): - """InternVL model in hf format.""" - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - from transformers import AutoConfig - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - arch = config.llm_config.architectures[0] - _readers = dict(InternLM2ForCausalLM=InternVL2AwqReader, - LlamaForCausalLM=InternVLAwqReader) - self.Reader = _readers[arch] diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index 08b8f214f5..fb94854a45 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -23,7 +23,7 @@ class LlamaReader(BaseReader): output_weight_key = 'lm_head.weight' def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): + model_cfg: dict, policy): super().__init__() self.params = unused_params self.params.update(new_params) @@ -32,6 +32,7 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False) if tie_word_embeddings: self.output_weight_key = self.tok_embeddings_key + self.weight_suffix, self.processor = policy self.init_layer_id() def init_layer_id(self): @@ -64,32 +65,34 @@ def output_weight(self): """Get output.""" return self.params.get(self.output_weight_key, None) - def _attn(self, i: int, kind: str, allow_none=False): + def _transform(self, x: torch.Tensor, kind: str): + return self.processor(x, kind) + + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" result = [] for key in ['q', 'k', 'v', 'o']: tensor = self.params.get( f'{self.attn_layer_prefix}.{i}.self_attn.{key}_proj.{kind}') - if not allow_none: - assert tensor is not None + tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) def attn(self, i: int): """Get q, k, v, o weight for layer i.""" - return self._attn(i, 'weight') + return self._attn(i, self.weight_suffix) def attn_bias(self, i: int): """Get q, k, v, o bias for layer i.""" - return self._attn(i, 'bias', allow_none=True) + return self._attn(i, 'bias') def attn_zero(self, i: int): """Get q, k, v, o zero point for layer i.""" - return (None, ) * 4 + return self._attn(i, 'qzeros') def attn_scale(self, i: int): """Get q, k, v, o scale for layer i.""" - return (None, ) * 4 + return self._attn(i, 'scales') def attn_norm(self, i: int): """Get attn norm for layer i.""" @@ -102,20 +105,21 @@ def _ffn(self, i: int, kind: str): for key in ['gate', 'down', 'up']: tensor = self.params[ f'{self.attn_layer_prefix}.{i}.mlp.{key}_proj.{kind}'] + tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) def ffn(self, i: int): """Get ffn weight for layer i.""" - return self._ffn(i, 'weight') + return self._ffn(i, self.weight_suffix) def ffn_zero(self, i: int): """Get ffn zero point for layer i.""" - return (None, ) * 3 + return self._ffn(i, 'qzeros') def ffn_scale(self, i: int): """Get ffn scale for 
layer i.""" - return (None, ) * 3 + return self._ffn(i, 'scales') def ffn_norm(self, i: int): """Get ffn norm for layer i.""" @@ -132,6 +136,7 @@ class LlamaModel(BaseInputModel): def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): super().__init__(model_path, tokenizer_path) ckpt_path = kwargs.get('ckpt_path') + self.policy = kwargs.get('input_policy') if ckpt_path is None: ckpt_path = model_path self.ckpt_path = ckpt_path @@ -169,8 +174,11 @@ def get_mgrs(self): map_location='cpu') else: new_params = load_file(osp.join(self.ckpt_path, ckpt)) - ret = self.Reader(new_params, unused_params, - i == self.nmgrs - 1, self.model_config) + ret = self.Reader(new_params, + unused_params, + i == self.nmgrs - 1, + self.model_config, + policy=self.policy) yield ret ret.clean_up(is_last_bin) except GeneratorExit: diff --git a/lmdeploy/turbomind/deploy/source_model/llama_awq.py b/lmdeploy/turbomind/deploy/source_model/llama_awq.py deleted file mode 100644 index 1d7fa2149d..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/llama_awq.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader - - -def ensure_fp16orint32(tensors: torch.Tensor): - """Ensure tensors in fp16/int32 format.""" - result = [] - for tensor in tensors: - if tensor is not None: - if tensor.dtype in [torch.float16, torch.float32, torch.bfloat16]: - result.append(tensor.half()) - else: - assert tensor.dtype == torch.int32 - result.append(tensor) - else: - result.append(None) - return (*result, ) - - -class LlamaAwqReader(LlamaReader): - """LlamaAwqReader.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def attn(self, i: int): - """Get q, k, v, o qweight for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qweight')) - - def attn_zero(self, i: int): - """Get q, k, v, o qzeros for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qzeros')) - - def attn_scale(self, i: int): - """Get q, k, v, o scales for layer i.""" - return ensure_fp16orint32(self._attn(i, 'scales')) - - def ffn(self, i: int): - """Get ffn qweight for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qweight')) - - def ffn_zero(self, i: int): - """Get ffn qzeros for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qzeros')) - - def ffn_scale(self, i: int): - """Get ffn scales for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'scales')) - - -@INPUT_MODELS.register_module(name='llama-awq') -class LlamaAwqModel(LlamaModel): - """Llama Awq model in hf format.""" - - Reader = LlamaAwqReader - - def __init__(self, - model_path: str, - tokenizer_path: str, - ckpt_path: str = None, - **kwargs): - super().__init__(model_path, - tokenizer_path, - ckpt_path=ckpt_path, - **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/minicpmv.py b/lmdeploy/turbomind/deploy/source_model/minicpmv.py index d249f2b519..a45469a470 100644 --- a/lmdeploy/turbomind/deploy/source_model/minicpmv.py +++ b/lmdeploy/turbomind/deploy/source_model/minicpmv.py @@ -17,7 +17,4 @@ class MiniCPMVReader(LlamaReader): @INPUT_MODELS.register_module(name='minicpmv') class MiniCPMVModel(LlamaModel): """MiniCPMV model in hf format.""" - - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - self.Reader = MiniCPMVReader + Reader = MiniCPMVReader 
diff --git a/lmdeploy/turbomind/deploy/source_model/minicpmv_awq.py b/lmdeploy/turbomind/deploy/source_model/minicpmv_awq.py deleted file mode 100644 index 3d0a22323d..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/minicpmv_awq.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from .base import INPUT_MODELS -from .llama_awq import LlamaAwqModel, LlamaAwqReader - - -class MiniCPMVAwqReader(LlamaAwqReader): - """MiniCPMVAwqReader model.""" - - attn_layer_prefix = 'llm.model.layers' - attn_layer_patten = r'llm.model.layers.([0-9]+).' - tok_embeddings_key = 'llm.model.embed_tokens.weight' - norm_weight_key = 'llm.model.norm.weight' - output_weight_key = 'llm.lm_head.weight' - - -@INPUT_MODELS.register_module(name='minicpmv-awq') -class MiniCPMVAwqModel(LlamaAwqModel): - """MiniCPMV awq model in hf format.""" - Reader = MiniCPMVAwqReader diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 3daf25cce3..311f8e0a85 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -16,35 +16,19 @@ class QwenReader(LlamaReader): norm_weight_key = 'transformer.ln_f.weight' output_weight_key = 'lm_head.weight' - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): + def _attn(self, i: int, kind: str): """Get q, k, v, o kind for layer i.""" + q, k, v, o = (None, ) * 4 qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}'] - q, k, v = torch.split(qkv, qkv.size(size_dim) // 3, dim=dim) - o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}', None) + qkv = self.transform(qkv, kind) + if qkv is not None: + q, k, v = torch.split(qkv, qkv.size(0) // 3, dim=0) + o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}') + o = self.transform(o, kind) if o is None: o = torch.zeros_like(q) return q, k, v, o - def attn(self, i: int): - """Get q, k, v, o weight for layer i.""" - return self._attn(i, 'weight', 0, 0) - - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer i.""" - return self._attn(i, 'bias', -1, 0) - - def attn_zero(self, i: int): - """Get q, k, v, o zero point for layer i.""" - return (None, ) * 4 - - def attn_scale(self, i: int): - """Get q, k, v, o scale for layer i.""" - return (None, ) * 4 - def attn_norm(self, i: int): """Get attn norm for layer i.""" return self.params[f'transformer.h.{i}.ln_1.weight'] @@ -54,21 +38,10 @@ def _ffn(self, i: int, kind: str): result = [] for key in ['w2', 'c_proj', 'w1']: tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}'] + tensor = self.transform(tensor, kind) result.append(tensor) return (*result, ) - def ffn(self, i: int): - """Get ffn weight for layer i.""" - return self._ffn(i, 'weight') - - def ffn_zero(self, i: int): - """Get ffn zero point for layer i.""" - return (None, ) * 3 - - def ffn_scale(self, i: int): - """Get ffn scale for layer i.""" - return (None, ) * 3 - def ffn_norm(self, i: int): """Get ffn norm for layer i.""" return self.params[f'transformer.h.{i}.ln_2.weight'] @@ -80,9 +53,6 @@ class QwenModel(LlamaModel): Reader = QwenReader - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - def tokenizer_info(self): """Read tokenizer info.""" n_words = 151851 @@ -114,43 +84,15 @@ def model_info(self): 
use_logn_attn=use_logn_attn) -class Qwen2Reader(LlamaReader): - """read qwen2 model weights. - - The weight name of qwen2 model is similar to llama, except its attention - bias doesn't include o_proj bias. Therefore, we make a dummy zero o_proj - bias to make it comply the definition of turbomind llama format - """ - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def attn_bias(self, i: int): - """Get q, k, v bias for layer i.""" - result = [] - - for key in ['q', 'k', 'v']: - tensor = self.params.get( - f'{self.attn_layer_prefix}.{i}.self_attn.{key}_proj.bias') - assert tensor is not None - result.append(tensor) - - tensor = self.params.get( - f'{self.attn_layer_prefix}.{i}.self_attn.o_proj.weight') - dummy_oproj_bias = tensor.new_zeros(tensor.shape[0]) - result.append(dummy_oproj_bias) - return (*result, ) - - @INPUT_MODELS.register_module(name='qwen2') class Qwen2Model(LlamaModel): - """Qwen model in hf format.""" + """Qwen model in hf format. - Reader = Qwen2Reader + The weight of qwen2 model is similar to Llama, except its attention bias + doesn't include o_proj bias. + """ - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) + Reader = LlamaReader def tokenizer_info(self): """set tokenizer info. diff --git a/lmdeploy/turbomind/deploy/source_model/qwen_awq.py b/lmdeploy/turbomind/deploy/source_model/qwen_awq.py deleted file mode 100644 index 6953b71687..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/qwen_awq.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base import INPUT_MODELS -from .llama_awq import LlamaAwqReader, ensure_fp16orint32 -from .qwen import Qwen2Model, QwenModel, QwenReader - - -class QwenAwqReader(QwenReader): - """QwenAwqReader.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def attn(self, i: int): - """Get q, k, v, o qweight for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1)) - - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer i.""" - return ensure_fp16orint32(self._attn(i, 'bias', -1, 0)) - - def attn_zero(self, i: int): - """Get q, k, v, o qzeros for layer i.""" - return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1)) - - def attn_scale(self, i: int): - """Get q, k, v, o scales for layer i.""" - return ensure_fp16orint32(self._attn(i, 'scales', -1, -1)) - - def ffn(self, i: int): - """Get ffn qweight for layer i.""" - # ours: w2(silu(w1(x)) * w3(x)) - # qwen: c_proj(w1(x) * silu(w2(x))) - return ensure_fp16orint32(self._ffn(i, 'qweight')) - - def ffn_zero(self, i: int): - """Get ffn qzeros for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qzeros')) - - def ffn_scale(self, i: int): - """Get ffn scales for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'scales')) - - -@INPUT_MODELS.register_module(name='qwen-awq') -class QwenAwqModel(QwenModel): - """Qwen awq model in hf format.""" - - Reader = QwenAwqReader - - def __init__(self, - model_path: str, - tokenizer_path: str, - ckpt_path: str = None, - **kwargs): - super().__init__(model_path, - tokenizer_path, - ckpt_path=ckpt_path, - **kwargs) - - -class Qwen2AwqReader(LlamaAwqReader): - """read qwen2 awq model weights.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: 
bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def attn_bias(self, i: int): - """Get q, k, v, o bias for layer i.""" - result = [] - - for key in ['q', 'k', 'v']: - tensor = self.params.get( - f'model.layers.{i}.self_attn.{key}_proj.bias') - assert tensor is not None - result.append(tensor) - - ref_tensor = result[0] - dummy_oproj_bias = ref_tensor.new_zeros(ref_tensor.shape) - result.append(dummy_oproj_bias) - return ensure_fp16orint32(result) - - -@INPUT_MODELS.register_module(name='qwen2-awq') -class Qwen2AwqModel(Qwen2Model): - """Qwen2 awq model in hf format.""" - - Reader = Qwen2AwqReader - - def __init__(self, - model_path: str, - tokenizer_path: str, - ckpt_path: str = None, - **kwargs): - super().__init__(model_path, - tokenizer_path, - ckpt_path=ckpt_path, - **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py b/lmdeploy/turbomind/deploy/source_model/xcomposer2.py index a577623026..5564c4b937 100644 --- a/lmdeploy/turbomind/deploy/source_model/xcomposer2.py +++ b/lmdeploy/turbomind/deploy/source_model/xcomposer2.py @@ -1,40 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -import torch from .base import INPUT_MODELS -from .llama import LlamaModel, LlamaReader +from .internlm2 import InternLM2Model, InternLM2Reader -class Xcomposer2Reader(LlamaReader): +class Xcomposer2Reader(InternLM2Reader): """Xcomposer2 model reader.""" - attn_layer_patten = r'model.layers.([0-9]+).' - tok_embeddings_key = 'model.tok_embeddings.weight' - norm_weight_key = 'model.norm.weight' - output_weight_key = 'output.weight' - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): - """Get q, k, v, o kind for layer i.""" - kv_head_num = self.model_cfg['num_key_value_heads'] - gs = int(self.model_cfg['num_attention_heads'] / kv_head_num) - qkv = self.params[f'model.layers.{i}.attention.wqkv.{kind}'] - qkv = qkv.view(kv_head_num, gs + 2, 128, -1) - hidden_dim = qkv.shape[-1] - q, k, v = torch.split(qkv, [gs, 1, 1], dim=1) - q = q.reshape(-1, hidden_dim) - k = k.reshape(-1, hidden_dim) - v = v.reshape(-1, hidden_dim) - o = self.params.get(f'model.layers.{i}.attention.wo.{kind}') - return q, k, v, o - - def attn(self, i: int): - """Get q, k, v, o weight for layer i.""" - return self._attn(i, 'weight', 0, 0) - def attn_lora_a(self, i): """Get attn lora_a.""" qkv = self.params[f'model.layers.{i}.attention.wqkv.Plora_A.weight'] @@ -43,26 +15,7 @@ def attn_lora_a(self, i): def attn_lora_b(self, i): """Get attn lora_b.""" - return self._attn(i, 'Plora_B.weight', 0, 0) - - def attn_bias(self, i: int): - return (None, ) * 4 - - def attn_norm(self, i: int): - """Get attn norm for layer i.""" - return self.params[f'model.layers.{i}.attention_norm.weight'] - - def _ffn(self, i: int, kind: str): - """Get ffn kind for layer i.""" - result = [] - for key in ['w1', 'w2', 'w3']: - tensor = self.params[f'model.layers.{i}.feed_forward.{key}.{kind}'] - result.append(tensor) - return (*result, ) - - def ffn(self, i: int): - """Get ffn weight for layer i.""" - return self._ffn(i, 'weight') + return self._attn(i, 'Plora_B.weight') def ffn_lora_a(self, i: int): """Get ffn lora_a weight for layer i.""" @@ -72,20 +25,13 @@ def ffn_lora_b(self, i: int): """Get fnn lora_b weight for layer i.""" return self._ffn(i, 'Plora_B.weight') - def ffn_norm(self, i: int): - """Get ffn norm for 
layer i.""" - return self.params[f'model.layers.{i}.ffn_norm.weight'] - @INPUT_MODELS.register_module(name='xcomposer2') -class Xcomposer2Model(LlamaModel): +class Xcomposer2Model(InternLM2Model): """Xcomposer2 model in hf format.""" Reader = Xcomposer2Reader - def __init__(self, model_path: str, tokenizer_path: str, **kwargs): - super().__init__(model_path, tokenizer_path, **kwargs) - def _lora_cfg_7b(self): """lora config for internlm-xcomposer2-7b.""" return dict(lora_r=256, diff --git a/lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py b/lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py deleted file mode 100644 index 5f6d699473..0000000000 --- a/lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from .base import INPUT_MODELS -from .llama_awq import ensure_fp16orint32 -from .xcomposer2 import Xcomposer2Model, Xcomposer2Reader - - -class Xcomposer2AwqReader(Xcomposer2Reader): - """LlamaAwqReader.""" - - def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, - model_cfg: dict): - super().__init__(new_params, unused_params, last_bin, model_cfg) - - def _attn(self, i: int, kind: str): - """Get q, k, v, o qweight for layer i.""" - kv_head_num = self.model_cfg['num_key_value_heads'] - gs = int(self.model_cfg['num_attention_heads'] / kv_head_num) - qkv = self.params[f'model.layers.{i}.attention.wqkv.{kind}'] - hidden_dim = qkv.shape[0] - qkv = qkv.view(hidden_dim, kv_head_num, gs + 2, -1) - q, k, v = torch.split(qkv, [gs, 1, 1], dim=-2) - q = q.reshape(hidden_dim, -1) - k = k.reshape(hidden_dim, -1) - v = v.reshape(hidden_dim, -1) - o = self.params[f'model.layers.{i}.attention.wo.{kind}'] - return ensure_fp16orint32((q, k, v, o)) - - def attn(self, i: int): - """Get q, k, v, o qweight for layer i.""" - return self._attn(i, 'qweight') - - def attn_zero(self, i: int): - """Get q, k, v, o qzeros for layer i.""" - return self._attn(i, 'qzeros') - - def attn_scale(self, i: int): - """Get q, k, v, o scales for layer i.""" - return self._attn(i, 'scales') - - def attn_lora_a(self, i): - """Get attn lora_a.""" - qkv = self.params[f'model.layers.{i}.attention.wqkv.Plora_A.weight'] - o = self.params[f'model.layers.{i}.attention.wo.Plora_A.weight'] - return qkv, o - - def attn_lora_b(self, i): - """Get attn lora_b.""" - return super()._attn(i, 'Plora_B.weight', 0, 0) - - def ffn(self, i: int): - """Get ffn qweight for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qweight')) - - def ffn_zero(self, i: int): - """Get ffn qzeros for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'qzeros')) - - def ffn_scale(self, i: int): - """Get ffn scales for layer i.""" - return ensure_fp16orint32(self._ffn(i, 'scales')) - - -@INPUT_MODELS.register_module(name='xcomposer2-awq') -class Xcomposer2AwqModel(Xcomposer2Model): - """Llama Awq model in hf format.""" - - Reader = Xcomposer2AwqReader - - def __init__(self, - model_path: str, - tokenizer_path: str, - ckpt_path: str = None, - **kwargs): - super().__init__(model_path, - tokenizer_path, - ckpt_path=ckpt_path, - **kwargs) diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py index c40f9224ff..505c70de30 100644 --- a/lmdeploy/turbomind/deploy/target_model/__init__.py +++ b/lmdeploy/turbomind/deploy/target_model/__init__.py @@ -1,5 +1,2 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .fp import TurbomindModel # noqa: F401 -from .plora import TurbomindPloraModel # noqa: F401 -from .plora_w4 import TurbomindPloraW4Model # noqa: F401 -from .w4 import TurbomindW4Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index ef1473bbe6..f969055759 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -164,6 +164,7 @@ class BaseOutputModel(ABC): def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, + exporter_factory, out_dir: str = ''): super().__init__() self.input_model = input_model @@ -171,11 +172,14 @@ def __init__(self, if not cfg.valid: self.cfg = self.get_config(cfg) assert self.cfg.valid + assert self.cfg.kv_head_num % self.cfg.tensor_para_size == 0 self.out_dir = out_dir self.to_file = True if out_dir else False self.tm_params = {} model_info = self.input_model.model_info() self.permute_qk = model_info.get('permute_qk', True) + # ! Dependency on `self` + self.exporters = exporter_factory(self) @abstractmethod def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig: @@ -252,14 +256,28 @@ def save_split(self, name: str, split_dim=None, copy=False) -> None: - """save split.""" + """save split. + + - 2D input + shape must be (input_dims, output_dims) + - 1D input (bias) + shape must be (output_dims) + split is skipped when split_dim == 0 + """ + + if copy or (tensor.dim() == 1 and split_dim == 0): + split_dim = None + copy = True + tp = self.cfg.tensor_para_size if split_dim is not None: tprint( f'*** splitting {name}, shape={tensor.shape}, ' f'split_dim={split_dim}, tp={tp}', to_file=self.to_file) - assert tensor.shape[split_dim] % tp == 0 + if tensor.shape[split_dim] % tp != 0: + raise RuntimeError( + f'{name}: shape={list(tensor.shape)}, tp={tp}') split_size = tensor.shape[split_dim] // tp splits = torch.split(tensor, split_size, dim=split_dim) for i, split in enumerate(splits): @@ -322,31 +340,7 @@ def pad_weight(tensor): output_weight = pad_weight(output_weight) self.export_weight(output_weight, 'output.weight') - @abstractmethod def export_transformer_block(self, bin: BaseReader, i: int) -> None: """Export transformer block.""" - pass - - -def permute(x: torch.Tensor, size_per_head: int = 128): - if x.shape[-1] > 1: - dim = x.shape[-1] - n_heads = dim // size_per_head - return x.view(-1, n_heads, 2, - dim // n_heads // 2).transpose(2, 3).reshape(-1, dim) - else: # scales, zeros - dim = x.shape[0] - n_heads = dim // size_per_head - return x.view(n_heads, 2, dim // n_heads // 2, - 1).transpose(1, 2).reshape(dim, 1) - - -def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int, - dim: int): - - def reshape(x): - return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1) - - qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1) - # (input_dim, head_num + 2 * kv_head_num) - return qkv.view(q.size(0), -1) + for e in self.exporters: + e.export(bin, i) diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py index 981a443c33..57c958fd36 100644 --- a/lmdeploy/turbomind/deploy/target_model/fp.py +++ b/lmdeploy/turbomind/deploy/target_model/fp.py @@ -1,84 +1,36 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import List -import torch +from .base import OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig -from ..source_model.base import BaseInputModel, BaseReader -from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig, - merge_qkv, permute) - -def transpose_tensor(input: List[torch.Tensor]): - """Transpose tensor.""" - output = [x.cuda().t() for x in input] - return output - - -@OUTPUT_MODELS.register_module(name=['fp16', 'bf16']) +@OUTPUT_MODELS.register_module(name='tm') class TurbomindModel(BaseOutputModel): """Export to turbomind fp16 format.""" - def __init__(self, - input_model: BaseInputModel, - cfg: TurbomindModelConfig, - out_dir: str = ''): - super().__init__(input_model, cfg, out_dir) - def get_config(self, cfg: TurbomindModelConfig): """Get turbomind config.""" final_cfg = super().get_config(cfg).__dict__ - # attn_bias, inter_size visit = False attn_bias = 0 for bin in self.input_model.bins(): for i in range(bin.start_layer_id, bin.end_layer_id): visit = True - w1, _, _ = bin.ffn(i) - inter_size = w1.t().shape[-1] + w1, w2, w3 = bin.ffn(i) + inter_size = w2.size(-1) qb, _, _, _ = bin.attn_bias(i) if qb is not None: attn_bias = 1 break if visit: break + inter_size = self._pad_inter_size(inter_size, final_cfg) final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) return TurbomindModelConfig.from_dict(final_cfg) - def export_transformer_block(self, bin: BaseReader, i: int): - """Export transformer layer i.""" - assert bin.start_layer_id <= i < bin.end_layer_id - tp = self.cfg.tensor_para_size - size_per_head = self.cfg.size_per_head - # attn - qw, kw, vw, ow = bin.attn(i) - qw, kw, vw, ow = transpose_tensor([qw, kw, vw, ow]) - if self.permute_qk: - qw = permute(qw, size_per_head) - kw = permute(kw, size_per_head) - qkv_w = merge_qkv(qw, kw, vw, tp, dim=2) - self.save_split(qkv_w, f'layers.{i}.attention.w_qkv.weight', -1) - self.save_split(ow, f'layers.{i}.attention.wo.weight', 0) - qb, kb, vb, ob = bin.attn_bias(i) - if qb is not None: - qb, kb, vb, ob = transpose_tensor([qb, kb, vb, ob]) - if self.permute_qk: - qb = permute(qb, size_per_head) - kb = permute(kb, size_per_head) - else: # simulate the side-effect of `permute` - qb = qb[None, :] - kb = kb[None, :] - qkv_b = merge_qkv(qb, kb, vb, tp, dim=1) - self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1) - self.save_split(ob, f'layers.{i}.attention.wo.bias', copy=True) - # ffn - w1, w2, w3 = bin.ffn(i) - w1, w2, w3 = transpose_tensor([w1, w2, w3]) - self.save_split(w1, f'layers.{i}.feed_forward.w1.weight', -1) - self.save_split(w3, f'layers.{i}.feed_forward.w3.weight', -1) - self.save_split(w2, f'layers.{i}.feed_forward.w2.weight', 0) - # norm - attn_norm = bin.attn_norm(i) - ffn_norm = bin.ffn_norm(i) - self.save_split(attn_norm, f'layers.{i}.attention_norm.weight') - self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') + def _pad_inter_size(self, inter_size: int, cfg: dict): + group_size = max(1, cfg['group_size']) + tp = cfg['tensor_para_size'] + groups_per_rank = (inter_size // group_size + tp - 1) // tp + inter_size_padded = groups_per_rank * group_size * tp + return inter_size_padded diff --git a/lmdeploy/turbomind/deploy/target_model/plora.py b/lmdeploy/turbomind/deploy/target_model/plora.py deleted file mode 100644 index 38edbb9a29..0000000000 --- a/lmdeploy/turbomind/deploy/target_model/plora.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from typing import List - -import torch - -from ..source_model.base import BaseInputModel, BaseReader -from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig, - merge_qkv, permute) - - -def transpose_tensor(input: List[torch.Tensor]): - """Transpose tensor.""" - output = [x.cuda().t() for x in input] - return output - - -@OUTPUT_MODELS.register_module(name=['plora']) -class TurbomindPloraModel(BaseOutputModel): - """Export to turbomind fp16 format.""" - - def __init__(self, - input_model: BaseInputModel, - cfg: TurbomindModelConfig, - out_dir: str = ''): - super().__init__(input_model, cfg, out_dir) - - def get_config(self, cfg: TurbomindModelConfig): - """Get turbomind config.""" - final_cfg = super().get_config(cfg).__dict__ - - # attn_bias, inter_size - visit = False - attn_bias = 0 - for bin in self.input_model.bins(): - for i in range(bin.start_layer_id, bin.end_layer_id): - visit = True - w1, _, _ = bin.ffn(i) - inter_size = w1.t().shape[-1] - qb, _, _, _ = bin.attn_bias(i) - if qb is not None: - attn_bias = 1 - break - if visit: - break - final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) - return TurbomindModelConfig.from_dict(final_cfg) - - def export_transformer_block(self, bin: BaseReader, i: int): - """Export transformer layer i.""" - assert bin.start_layer_id <= i < bin.end_layer_id - tp = self.cfg.tensor_para_size - size_per_head = self.cfg.size_per_head - # attn - qw, kw, vw, ow = bin.attn(i) - qw, kw, vw, ow = transpose_tensor([qw, kw, vw, ow]) - qw = permute(qw, size_per_head) - kw = permute(kw, size_per_head) - qkv_w = merge_qkv(qw, kw, vw, tp, dim=2) - self.save_split(qkv_w, f'layers.{i}.attention.w_qkv.weight', -1) - self.save_split(ow, f'layers.{i}.attention.wo.weight', 0) - qb, kb, vb, ob = bin.attn_bias(i) - # attn lora_a - lora_a_qkv, lora_a_o = bin.attn_lora_a(i) - lora_a_qkv, lora_a_o = transpose_tensor([lora_a_qkv, lora_a_o]) - # print(lora_a_qkv.shape, lora_a_o.shape) - self.save_split(lora_a_qkv, - f'layers.{i}.attention.w_qkv.lora_a.weight', - copy=True) - self.save_split(lora_a_o, f'layers.{i}.attention.wo.lora_a.weight', 0) - # attn lora_b - lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow = bin.attn_lora_b(i) - lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow = transpose_tensor( - [lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow]) - lora_b_qw = permute(lora_b_qw, size_per_head) - lora_b_kw = permute(lora_b_kw, size_per_head) - lora_b_qkv_w = merge_qkv(lora_b_qw, lora_b_kw, lora_b_vw, tp, dim=2) - self.save_split(lora_b_qkv_w, - f'layers.{i}.attention.w_qkv.lora_b.weight', -1) - self.save_split(lora_b_ow, - f'layers.{i}.attention.wo.lora_b.weight', - copy=True) - - if qb is not None: - qb, kb, vb, ob = transpose_tensor([qb, kb, vb, ob]) - qb = permute(qb, size_per_head) - kb = permute(kb, size_per_head) - qkv_b = merge_qkv(qb, kb, vb, tp, dim=1) - self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1) - self.save_split(ob, f'layers.{i}.attention.wo.bias', copy=True) - # ffn - w1, w2, w3 = bin.ffn(i) - w1, w2, w3 = transpose_tensor([w1, w2, w3]) - self.save_split(w1, f'layers.{i}.feed_forward.w1.weight', -1) - self.save_split(w3, f'layers.{i}.feed_forward.w3.weight', -1) - self.save_split(w2, f'layers.{i}.feed_forward.w2.weight', 0) - # # ffn lora_a - lora_a_w1, lora_a_w2, lora_a_w3 = bin.ffn_lora_a(i) - lora_a_w1, lora_a_w2, lora_a_w3 = transpose_tensor( - [lora_a_w1, lora_a_w2, lora_a_w3]) - # print('lora_a_w1', lora_a_w1.shape, lora_a_w2.shape, lora_a_w3.shape) - self.save_split(lora_a_w1, - 
f'layers.{i}.feed_forward.w1.lora_a.weight', - copy=True) - self.save_split(lora_a_w3, - f'layers.{i}.feed_forward.w3.lora_a.weight', - copy=True) - self.save_split(lora_a_w2, f'layers.{i}.feed_forward.w2.lora_a.weight', - 0) - # # ffn lora_b - lora_b_w1, lora_b_w2, lora_b_w3 = bin.ffn_lora_b(i) - lora_b_w1, lora_b_w2, lora_b_w3 = transpose_tensor( - [lora_b_w1, lora_b_w2, lora_b_w3]) - # print('lora_b_w1', lora_b_w1.shape, lora_b_w2.shape, lora_b_w3.shape) - self.save_split(lora_b_w1, f'layers.{i}.feed_forward.w1.lora_b.weight', - -1) - self.save_split(lora_b_w3, f'layers.{i}.feed_forward.w3.lora_b.weight', - -1) - self.save_split(lora_b_w2, - f'layers.{i}.feed_forward.w2.lora_b.weight', - copy=True) - - # norm - attn_norm = bin.attn_norm(i) - ffn_norm = bin.ffn_norm(i) - self.save_split(attn_norm, f'layers.{i}.attention_norm.weight') - self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') diff --git a/lmdeploy/turbomind/deploy/target_model/plora_w4.py b/lmdeploy/turbomind/deploy/target_model/plora_w4.py deleted file mode 100644 index 0adf437e28..0000000000 --- a/lmdeploy/turbomind/deploy/target_model/plora_w4.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..source_model.base import BaseInputModel, BaseReader -from .base import OUTPUT_MODELS, TurbomindModelConfig, merge_qkv, permute -from .plora import TurbomindPloraModel, transpose_tensor -from .w4 import convert_s4, get_cuda_tensor, tp_m_s4, transpose_qk_s4 - - -@OUTPUT_MODELS.register_module(name=['plora-w4']) -class TurbomindPloraW4Model(TurbomindPloraModel): - """Export to turbomind plora w4 format.""" - - def __init__(self, - input_model: BaseInputModel, - cfg: TurbomindModelConfig, - out_dir: str = ''): - super().__init__(input_model, cfg, out_dir) - - def get_config(self, cfg: TurbomindModelConfig): - """Get turbomind config.""" - final_cfg = super().get_config(cfg).__dict__ - - # attn_bias, inter_size - visit = False - attn_bias = 0 - for bin in self.input_model.bins(): - for i in range(bin.start_layer_id, bin.end_layer_id): - visit = True - w1s, _, _ = bin.ffn_scale(i) - inter_size = w1s.shape[-1] - qb, _, _, _ = bin.attn_bias(i) - if qb is not None: - attn_bias = 1 - break - if visit: - break - final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) - return TurbomindModelConfig.from_dict(final_cfg) - - def export_transformer_block(self, bin: BaseReader, i: int): - """Export transformer layer i.""" - assert bin.start_layer_id <= i < bin.end_layer_id - group_size = self.cfg.group_size - tp = self.cfg.tensor_para_size - size_per_head = self.cfg.size_per_head - # attn - q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i)) - q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i)) - q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i)) - - q_qw = transpose_qk_s4(q_qw, group_size) - k_qw = transpose_qk_s4(k_qw, group_size) - q_qz = transpose_qk_s4(q_qz, group_size) - k_qz = transpose_qk_s4(k_qz, group_size) - q_s = permute(q_s, size_per_head) - k_s = permute(k_s, size_per_head) - - qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2) - qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2) - qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2) - - qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size) - qkv_qw = tp_m_s4(qkv_qw, tp) - self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1) - self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1) - o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size) - self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 
0) - self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0) - - q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i)) - if q_b is not None: - q_b = permute(q_b, size_per_head) - k_b = permute(k_b, size_per_head) - qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1) - self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1) - self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True) - - # ffn weights - w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i)) - w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i)) - w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i)) - - w1_qw, w1_sz = convert_s4(w1_qw, w1_qz, w1_s, group_size) - w3_qw, w3_sz = convert_s4(w3_qw, w3_qz, w3_s, group_size) - w1_qw = tp_m_s4(w1_qw, tp) - w3_qw = tp_m_s4(w3_qw, tp) - self.save_split(w1_qw, f'layers.{i}.feed_forward.w1.qweight', -1) - self.save_split(w1_sz, f'layers.{i}.feed_forward.w1.scales_zeros', -1) - self.save_split(w3_qw, f'layers.{i}.feed_forward.w3.qweight', -1) - self.save_split(w3_sz, f'layers.{i}.feed_forward.w3.scales_zeros', -1) - - w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size) - self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0) - self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0) - - # attn lora_a - lora_a_qkv, lora_a_o = bin.attn_lora_a(i) - lora_a_qkv, lora_a_o = transpose_tensor([lora_a_qkv, lora_a_o]) - # print(lora_a_qkv.shape, lora_a_o.shape) - self.save_split(lora_a_qkv, - f'layers.{i}.attention.w_qkv.lora_a.weight', - copy=True) - self.save_split(lora_a_o, f'layers.{i}.attention.wo.lora_a.weight', 0) - # attn lora_b - lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow = bin.attn_lora_b(i) - lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow = transpose_tensor( - [lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow]) - lora_b_qw = permute(lora_b_qw, size_per_head) - lora_b_kw = permute(lora_b_kw, size_per_head) - lora_b_qkv_w = merge_qkv(lora_b_qw, lora_b_kw, lora_b_vw, tp, dim=2) - self.save_split(lora_b_qkv_w, - f'layers.{i}.attention.w_qkv.lora_b.weight', -1) - self.save_split(lora_b_ow, - f'layers.{i}.attention.wo.lora_b.weight', - copy=True) - - # # ffn lora_a - lora_a_w1, lora_a_w2, lora_a_w3 = bin.ffn_lora_a(i) - lora_a_w1, lora_a_w2, lora_a_w3 = transpose_tensor( - [lora_a_w1, lora_a_w2, lora_a_w3]) - # print('lora_a_w1', lora_a_w1.shape, lora_a_w2.shape, lora_a_w3.shape) - self.save_split(lora_a_w2, f'layers.{i}.feed_forward.w2.lora_a.weight', - 0) - self.save_split(lora_a_w1, - f'layers.{i}.feed_forward.w1.lora_a.weight', - copy=True) - self.save_split(lora_a_w3, - f'layers.{i}.feed_forward.w3.lora_a.weight', - copy=True) - # # ffn lora_b - lora_b_w1, lora_b_w2, lora_b_w3 = bin.ffn_lora_b(i) - lora_b_w1, lora_b_w2, lora_b_w3 = transpose_tensor( - [lora_b_w1, lora_b_w2, lora_b_w3]) - # print('lora_b_w1', lora_b_w1.shape, lora_b_w2.shape, lora_b_w3.shape) - self.save_split(lora_b_w1, f'layers.{i}.feed_forward.w1.lora_b.weight', - -1) - self.save_split(lora_b_w3, f'layers.{i}.feed_forward.w3.lora_b.weight', - -1) - self.save_split(lora_b_w2, - f'layers.{i}.feed_forward.w2.lora_b.weight', - copy=True) - - # norm - attn_norm = bin.attn_norm(i) - ffn_norm = bin.ffn_norm(i) - self.save_split(attn_norm, f'layers.{i}.attention_norm.weight') - self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') diff --git a/lmdeploy/turbomind/deploy/target_model/w4.py b/lmdeploy/turbomind/deploy/target_model/w4.py deleted file mode 100644 index 555ca69398..0000000000 --- a/lmdeploy/turbomind/deploy/target_model/w4.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 
OpenMMLab. All rights reserved. -import os.path as osp -import sys - -import torch - -import lmdeploy - -from ..source_model.base import BaseInputModel, BaseReader -from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig, - merge_qkv, permute) - -# import _turbomind as _tm -# TODO: find another way import _turbomind -lmdeploy_dir = osp.split(lmdeploy.__file__)[0] -sys.path.append(osp.join(lmdeploy_dir, 'lib')) -import _turbomind as _tm # noqa: E402 - - -def transpose_qk_s4(src: torch.Tensor, group_size): - assert src.is_contiguous() - dst = torch.zeros_like(src) - _tm.transpose_qk_s4_k_m8(src, dst, - src.size(-1) * 8, src.size(0), group_size) - return dst - - -def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor, w1_s: torch.Tensor, - w3_qw: torch.Tensor, w3_qz: torch.Tensor, - w3_s: torch.Tensor): - - def fuse(a: torch.Tensor, b: torch.Tensor): - ab = torch.cat((a, b)).contiguous() - _ab = torch.zeros_like(ab) - _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0)) - return _ab.view(a.size(0), -1) - - w13_qw = fuse(w1_qw, w3_qw) - w13_qz = fuse(w1_qz, w3_qz) - - w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1) - w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1) - - return w13_qw, w13_qz, w13_s - - -def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor, - group_size: int): - assert qw.is_contiguous() - assert qz.is_contiguous() - assert s.is_contiguous() - _qw = torch.zeros_like(qw) - _sz = torch.zeros_like(s, dtype=torch.int32) # half2 - _ws = torch.zeros_like(s) - _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz, - qw.size(-1) * 8, qw.size(0), group_size) - return _qw, _sz - - -def tp_m_s4(x: torch.Tensor, tp: int): - return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3, - 1).contiguous() - - -def get_cuda_tensor(tensors): - """Get cuda tensor.""" - result = map(lambda x: x.cuda() if x is not None else x, tensors) - return (*result, ) - - -@OUTPUT_MODELS.register_module(name='w4') -class TurbomindW4Model(BaseOutputModel): - """Export to turbomind w4a16 format.""" - - def __init__(self, - input_model: BaseInputModel, - cfg: TurbomindModelConfig, - out_dir: str = ''): - super().__init__(input_model, cfg, out_dir) - - def get_config(self, cfg: TurbomindModelConfig): - """Get turbomind config.""" - final_cfg = super().get_config(cfg).__dict__ - - # attn_bias, inter_size - visit = False - attn_bias = 0 - for bin in self.input_model.bins(): - for i in range(bin.start_layer_id, bin.end_layer_id): - visit = True - w1s, _, _ = bin.ffn_scale(i) - inter_size = w1s.shape[-1] - qb, _, _, _ = bin.attn_bias(i) - if qb is not None: - attn_bias = 1 - break - if visit: - break - final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) - return TurbomindModelConfig.from_dict(final_cfg) - - def export_transformer_block(self, bin: BaseReader, i: int): - """Export transformer layer i.""" - group_size = self.cfg.group_size - tp = self.cfg.tensor_para_size - size_per_head = self.cfg.size_per_head - # attn - q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i)) - q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i)) - q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i)) - - if self.permute_qk: - q_qw = transpose_qk_s4(q_qw, group_size) - k_qw = transpose_qk_s4(k_qw, group_size) - q_qz = transpose_qk_s4(q_qz, group_size) - k_qz = transpose_qk_s4(k_qz, group_size) - q_s = permute(q_s, size_per_head) - k_s = permute(k_s, size_per_head) - - qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2) - qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, 
dim=2) - qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2) - - qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size) - qkv_qw = tp_m_s4(qkv_qw, tp) - self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1) - self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1) - - o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size) - self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0) - self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0) - - q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i)) - if q_b is not None: - if self.permute_qk: - q_b = permute(q_b, size_per_head) - k_b = permute(k_b, size_per_head) - else: - q_b = q_b[None, :] - k_b = k_b[None, :] - qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1) - self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1) - self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True) - - # ffn weights - w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i)) - w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i)) - w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i)) - - w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz, - w3_s) - w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size) - w13_qw = tp_m_s4(w13_qw, tp) - self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1) - self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros', - -1) - - w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size) - self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0) - self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0) - - # norm - attn_norm = bin.attn_norm(i) - ffn_norm = bin.ffn_norm(i) - self.save_split(attn_norm, f'layers.{i}.attention_norm.weight') - self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 5fd9824a03..18317eee0d 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -4,6 +4,7 @@ import sys from concurrent.futures import ThreadPoolExecutor from configparser import ConfigParser +from itertools import repeat from queue import LifoQueue, Queue from typing import Dict, Iterable, List, Union @@ -103,6 +104,20 @@ def __init__(self, model_path=model_path, engine_config=engine_config) + with ThreadPoolExecutor(max_workers=self.gpu_count) as e: + ranks = [ + self.node_id * self.gpu_count + device_id + for device_id in range(self.gpu_count) + ] + for _ in e.map(self.model_comm.process_weight, + range(self.gpu_count), ranks): + pass + # implicit synchronization + for _ in e.map(self.model_comm.create_engine, + range(self.gpu_count), ranks, + repeat(self.nccl_params)): + pass + self.session_len = self.config.session_len self.eos_id = self.tokenizer.eos_token_id @@ -172,6 +187,13 @@ def _from_hf(self, model_source: ModelSource, model_path: str, if quant_method == 'awq' and group_size == 128 and \ version == 'gemm': engine_config.model_format = 'awq' + elif all((quant_method == 'gptq', group_size == 128, + not quant_config.get('desc_act', False), + quant_config.get('sym', True))): + engine_config.model_format = 'gptq' + else: + raise AssertionError( + f'unsupported quant config: {quant_config}') assert is_supported(model_path), ( f'turbomind does not support {model_path}. 
' @@ -206,7 +228,6 @@ def _from_hf(self, model_source: ModelSource, model_path: str, 'the model may not be loaded successfully ' f'with {len(tm_params)} uninitialized params:\n{uninitialized}' ) - return model_comm def _from_workspace(self, model_path: str, diff --git a/src/turbomind/kernels/CMakeLists.txt b/src/turbomind/kernels/CMakeLists.txt index b7ef1c7252..818ef87043 100644 --- a/src/turbomind/kernels/CMakeLists.txt +++ b/src/turbomind/kernels/CMakeLists.txt @@ -58,8 +58,9 @@ add_library(custom_ar_kernels STATIC custom_ar_kernels.cu) set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -add_subdirectory(gemm_s_f16) if (BUILD_TEST) add_subdirectory(flash_attention) endif () + add_subdirectory(attention) +add_subdirectory(gemm) diff --git a/src/turbomind/kernels/activation_kernels.cu b/src/turbomind/kernels/activation_kernels.cu index 0bd76c36a9..ffb67ae2a3 100644 --- a/src/turbomind/kernels/activation_kernels.cu +++ b/src/turbomind/kernels/activation_kernels.cu @@ -15,6 +15,9 @@ */ #include "src/turbomind/kernels/activation_kernels.h" +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/math.h" #include "src/turbomind/macro.h" #include "src/turbomind/utils/cuda_type_utils.cuh" #include "src/turbomind/utils/cuda_utils.h" @@ -319,4 +322,61 @@ INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, float, float); INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, __nv_bfloat16, __nv_bfloat16); #endif +// `output` may be an alias of `inter_buf` +template class Activation, typename T> +__global__ void activation_kernel(T* inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims) +{ + const int di = threadIdx.x + blockIdx.x * blockDim.x; + const int ti = blockIdx.y; + + dims /= VecSize; + + if (di >= dims) { + return; + } + + using Vec = Array; + + auto p_inter = reinterpret_cast(inter_buf + ti * stride); + auto p_gate = reinterpret_cast(gate_buf + ti * stride); + + Vec inter; + Load(inter, (T*)&p_inter[di]); + + Vec gate; + Ldg(gate, (const T*)&p_gate[di]); + + PRAGMA_UNROLL + for (int i = 0; i < VecSize; ++i) { + inter[i] = Activation::apply(inter[i]) * gate[i]; + } + + Store((T*)&p_inter[di], inter); +} + +template class Activation, typename T> +void invokeGenericActivation_v2( + T* inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims, cudaStream_t stream) +{ + constexpr int kVecSize = 4; + + constexpr int block = 256; + const dim3 grid(ceil_div(dims, block * kVecSize), token_num); + + activation_kernel + <<>>(inter_buf, gate_buf, stride, token_num, dims); +} + +#define INSTANTIATE_ACTIVATION(Activation, T) \ + template void invokeGenericActivation_v2( \ + T * inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims, cudaStream_t stream) + +INSTANTIATE_ACTIVATION(SiluActivation, half); +#ifdef ENABLE_FP32 +INSTANTIATE_ACTIVATION(SiluActivation, float); +#endif +#ifdef ENABLE_BF16 +INSTANTIATE_ACTIVATION(SiluActivation, __nv_bfloat16); +#endif + } // namespace turbomind diff --git a/src/turbomind/kernels/activation_kernels.h b/src/turbomind/kernels/activation_kernels.h index 776b614c9c..1197ee4806 100644 --- a/src/turbomind/kernels/activation_kernels.h +++ b/src/turbomind/kernels/activation_kernels.h @@ -76,6 +76,10 @@ void invokeGenericActivation(T* out, stream); } +template class Activation, typename T> +void invokeGenericActivation_v2( 
+ T* inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims, cudaStream_t stream); + template void invokeAddBiasGeluV2(T* out, const T* bias, diff --git a/src/turbomind/kernels/attention/attention_universal.h b/src/turbomind/kernels/attention/attention_universal.h index 1d6466ba80..352cc14725 100644 --- a/src/turbomind/kernels/attention/attention_universal.h +++ b/src/turbomind/kernels/attention/attention_universal.h @@ -2,13 +2,12 @@ #pragma once -#include "array_ops.h" - -#include "block.h" -#include "iterator.h" #include "quantization.h" -#include "reduce_kernel.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/attention/reduce_kernel.h" +#include "src/turbomind/kernels/attention/rotary_embedding.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/sync.h" #include #include diff --git a/src/turbomind/kernels/attention/block.h b/src/turbomind/kernels/attention/block.h index ad4f299aae..75c1626383 100644 --- a/src/turbomind/kernels/attention/block.h +++ b/src/turbomind/kernels/attention/block.h @@ -2,20 +2,12 @@ #pragma once -#include "data_type.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/sub_byte_ptr.h" #include #include -#if defined(__CUDACC__) -#define TM_HOST_DEVICE __forceinline__ __host__ __device__ -#define TM_DEVICE __forceinline__ __device__ -#define TM_HOST __forceinline__ __host__ -#else -#define TM_HOST_DEVICE inline -#define TM_DEVICE inline -#define TM_HOST inline -#endif - namespace turbomind { namespace block { diff --git a/src/turbomind/kernels/attention/decoding_template.h b/src/turbomind/kernels/attention/decoding_template.h index b6a3a588ef..b9af727f86 100644 --- a/src/turbomind/kernels/attention/decoding_template.h +++ b/src/turbomind/kernels/attention/decoding_template.h @@ -5,7 +5,7 @@ #include "attention_params.h" #include "attention_universal.h" #include "reduce.h" -#include "src/turbomind/kernels/attention/thread_map.h" +#include "src/turbomind/kernels/core/thread_map.h" #include "utils.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/impl_16816.h b/src/turbomind/kernels/attention/impl_16816.h index f0ceba0070..69e0a6a48c 100644 --- a/src/turbomind/kernels/attention/impl_16816.h +++ b/src/turbomind/kernels/attention/impl_16816.h @@ -2,13 +2,12 @@ #pragma once -#include "array_ops.h" -#include "impl.h" -#include "impl_m16n8.h" -#include "iterator.h" -#include "src/turbomind/kernels/attention/thread_map.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" -#include +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/attention/impl_m16n8.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/smem.h" +#include "src/turbomind/kernels/core/thread_map.h" namespace turbomind::attention { diff --git a/src/turbomind/kernels/attention/impl_1688.h b/src/turbomind/kernels/attention/impl_1688.h index c6b948b0fe..856ddcd587 100644 --- a/src/turbomind/kernels/attention/impl_1688.h +++ b/src/turbomind/kernels/attention/impl_1688.h @@ -2,11 +2,12 @@ #pragma once -#include "impl.h" -#include "impl_m16n8.h" -#include "iterator.h" -#include "src/turbomind/kernels/attention/thread_map.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/attention/impl.h" 
+#include "src/turbomind/kernels/attention/impl_m16n8.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/mma.h" +#include "src/turbomind/kernels/core/smem.h" +#include "src/turbomind/kernels/core/thread_map.h" namespace turbomind::attention { diff --git a/src/turbomind/kernels/attention/impl_81616.h b/src/turbomind/kernels/attention/impl_81616.h index 528d1690e4..0c0baa531a 100644 --- a/src/turbomind/kernels/attention/impl_81616.h +++ b/src/turbomind/kernels/attention/impl_81616.h @@ -2,12 +2,13 @@ #pragma once -#include "array_ops.h" -#include "impl.h" -#include "iterator.h" +#include "src/turbomind/kernels/attention/impl.h" #include "src/turbomind/kernels/attention/quantization.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" -#include "thread_map.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/mma.h" +#include "src/turbomind/kernels/core/smem.h" +#include "src/turbomind/kernels/core/thread_map.h" #include namespace turbomind::attention { diff --git a/src/turbomind/kernels/attention/impl_884.h b/src/turbomind/kernels/attention/impl_884.h index b10323b287..9f23296032 100644 --- a/src/turbomind/kernels/attention/impl_884.h +++ b/src/turbomind/kernels/attention/impl_884.h @@ -2,57 +2,16 @@ #pragma once -#include "array_ops.h" -#include "impl.h" -#include "src/turbomind/kernels/attention/iterator.h" -#include "src/turbomind/kernels/attention/thread_map.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/mma.h" +#include "src/turbomind/kernels/core/thread_map.h" + #include namespace turbomind::attention { -__inline__ __device__ void -mma_m8n8k4_row_col(Array& d, const Array& a, const Array& b, Array& c) -{ -#if TURBOMIND_ARCH_SM70 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - // clang-format off - asm volatile( - "mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3, %4, %5, %6, %7}," - "{%8, %9}," - "{%10, %11}," - "{%12, %13, %14, %15, %16, %17, %18, %19};" - : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3]), "=f"(d[4]), "=f"(d[5]), "=f"(d[6]), "=f"(d[7]) - : "r"(A[0]), "r"(A[1]), - "r"(B[0]), "r"(B[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7])); -// clang-format on -#endif -} - -__inline__ __device__ void -mma_m8n8k4_row_row(Array& d, const Array& a, const Array& b, Array& c) -{ -#if TURBOMIND_ARCH_SM70 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - // clang-format off - asm volatile( - "mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32" - "{%0, %1, %2, %3, %4, %5, %6, %7}," - "{%8, %9}," - "{%10, %11}," - "{%12, %13, %14, %15, %16, %17, %18, %19};" - : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3]), "=f"(d[4]), "=f"(d[5]), "=f"(d[6]), "=f"(d[7]) - : "r"(A[0]), "r"(A[1]), - "r"(B[0]), "r"(B[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7])); -// clang-format on -#endif -} - template struct Impl { using T = T_; diff --git a/src/turbomind/kernels/attention/impl_m16n8.h b/src/turbomind/kernels/attention/impl_m16n8.h index 9cb782394a..78594616d2 100644 --- a/src/turbomind/kernels/attention/impl_m16n8.h +++ b/src/turbomind/kernels/attention/impl_m16n8.h @@ -2,7 +2,7 @@ #pragma once 
-#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/core/array.h" namespace turbomind::attention { diff --git a/src/turbomind/kernels/attention/impl_simt.h b/src/turbomind/kernels/attention/impl_simt.h index 60efd922b7..a886185a44 100644 --- a/src/turbomind/kernels/attention/impl_simt.h +++ b/src/turbomind/kernels/attention/impl_simt.h @@ -2,12 +2,10 @@ #pragma once -#include "array_ops.h" -#include "impl.h" -#include "iterator.h" -#include "src/turbomind/kernels/attention/data_type.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" -#include "thread_map.h" +#include "src/turbomind/kernels/attention/impl.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/thread_map.h" #include #include diff --git a/src/turbomind/kernels/attention/iterator.h b/src/turbomind/kernels/attention/iterator.h index bf08900752..2bf4e4e085 100644 --- a/src/turbomind/kernels/attention/iterator.h +++ b/src/turbomind/kernels/attention/iterator.h @@ -2,10 +2,10 @@ #pragma once -#include "../gemm_s_f16/common.h" -#include "array_ops.h" -#include "smem_layout.h" -#include "src/turbomind/kernels/attention/data_type.h" +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/pipe_iter.h" #include namespace turbomind { @@ -124,22 +124,4 @@ struct CombinedIterator { } }; -template -struct PipeIter { - static constexpr int kMaxStep = Stages * Step; - - int r = 0; - int w = kMaxStep - Step; - - __inline__ __device__ PipeIter& operator++() - { - w = r; - r += Step; - if (r == kMaxStep) { - r -= kMaxStep; - } - return *this; - } -}; - } // namespace turbomind diff --git a/src/turbomind/kernels/attention/iterator_sm70.h b/src/turbomind/kernels/attention/iterator_sm70.h index 577c6e0504..14a67e57fe 100644 --- a/src/turbomind/kernels/attention/iterator_sm70.h +++ b/src/turbomind/kernels/attention/iterator_sm70.h @@ -2,8 +2,8 @@ #pragma once -#include "array_ops.h" #include "iterator.h" +#include "src/turbomind/kernels/core/array_ops.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/iterator_sm80.h b/src/turbomind/kernels/attention/iterator_sm80.h index 9a2f6db415..bc3fce5d40 100644 --- a/src/turbomind/kernels/attention/iterator_sm80.h +++ b/src/turbomind/kernels/attention/iterator_sm80.h @@ -3,7 +3,7 @@ #pragma once #include "iterator.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include #include namespace turbomind { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 984a287ff8..1edb92f374 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -1,12 +1,12 @@ // Copyright (c) OpenMMLab. All rights reserved. 
-#include "array_ops.h" -#include "block.h" -#include "kv_cache_utils_v2.h" -#include "quantization.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/attention/block.h" +#include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" +#include "src/turbomind/kernels/attention/quantization.h" +#include "src/turbomind/kernels/attention/rotary_embedding.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/thread_map.h" #include "src/turbomind/models/llama/llama_utils.h" -#include "thread_map.h" #include namespace turbomind { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.h b/src/turbomind/kernels/attention/kv_cache_utils_v2.h index eced5d64ea..74ba7fafb0 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.h +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.h @@ -2,7 +2,7 @@ #pragma once -#include "attention_params.h" +#include "src/turbomind/kernels/attention/attention_params.h" #include "src/turbomind/utils/Tensor.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/mainloop_sm80.h b/src/turbomind/kernels/attention/mainloop_sm80.h index fbcd1aa6cd..bf0fc1d32a 100644 --- a/src/turbomind/kernels/attention/mainloop_sm80.h +++ b/src/turbomind/kernels/attention/mainloop_sm80.h @@ -4,7 +4,7 @@ #include "iterator_sm80.h" #include "mainloop.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/core/pipe_iter.h" #include #include diff --git a/src/turbomind/kernels/attention/quantization.h b/src/turbomind/kernels/attention/quantization.h index e50e0c855a..02f49d0089 100644 --- a/src/turbomind/kernels/attention/quantization.h +++ b/src/turbomind/kernels/attention/quantization.h @@ -1,8 +1,7 @@ #pragma once -#include "src/turbomind/kernels/attention/array_ops.h" -#include "src/turbomind/kernels/attention/data_type.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/data_type.h" #include #include @@ -376,6 +375,45 @@ quantize(Array (&dst)[S][C], const Array (&src)[S][C], const Array

floating point / integer -> integer conversion +template +struct ConvertKvCache { + __device__ __host__ ConvertKvCache(float, float) {} + template + __device__ static auto convert(const Array& vi) + { + Array vo; + PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + vo[i] = (To)vi[i]; + } + return vo; + } + template + inline __device__ auto operator()(const Array& vi) const -> Array + { + return convert(vi); + } +}; + +// generic case for converting to same type, bypass +template +struct ConvertKvCache { + __device__ __host__ ConvertKvCache(float, float) {} + template + __device__ static auto convert(const Array& v) + { + return v; + } + template + inline __device__ auto operator()(const Array& v) const -> Array + { + return convert(v); + } +}; + // floating point -> u8 template struct ConvertKvCache { @@ -441,7 +479,6 @@ struct ConvertKvCache { return vo; } }; - template<> struct ConvertKvCache { @@ -464,7 +501,8 @@ struct ConvertKvCache { static constexpr uint32_t TOP_MASK = 0x00f000f0; static constexpr uint32_t MAGIC_NUM_0 = 0x64006400; // `1024` static constexpr uint32_t MAGIC_NUM_1 = 0x54005400; // `64` - const uint32_t top_i4s = i4s >> 8; + // const uint32_t top_i4s = i4s >> 8; + uint32_t top_i4s = __byte_perm(i4s, 0, 0x4321); asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_0), "n"(immLut)); asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut)); asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_0), "n"(immLut)); @@ -487,6 +525,7 @@ struct ConvertKvCache { static constexpr uint32_t MAGIC_NUM_1 = 0x54005400; // `64` static constexpr uint32_t MAGIC_NUM_2 = MAGIC_NUM_1 >> 4; // `64` >> 4 const uint32_t top_i4s = i4s >> 8; + // uint32_t top_i4s = __byte_perm(i4s, 0, 0x4321); asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_2), "n"(immLut)); asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut)); asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_2), "n"(immLut)); @@ -655,6 +694,56 @@ struct ConvertKvCache { } }; +inline __device__ Array cvt_bf16x4_e4m3(const Array& v) +{ +#if TURBOMIND_ARCH_SM80 + static constexpr uint32_t EM_MASK = 0x7f007f00; + static constexpr uint32_t S_MASK = 0x80008000; + + Array result; + uint32_t* h = reinterpret_cast(&result); + + const uint32_t& i2s_0 = reinterpret_cast(v); + const uint32_t i2s_1 = i2s_0 << 8; + + /// TODO: Check LOP3 is generated for (a | (b & c)) + h[0] = ((i2s_0 & EM_MASK) >> 4) | (i2s_0 & S_MASK); + h[1] = ((i2s_1 & EM_MASK) >> 4) | (i2s_1 & S_MASK); + + // SEEEEEEE EMMMMMMM + // 1111011 1 // 2^(127-7) 0x7b80 + + /// TODO: fuse this with per channel scaling + const nv_bfloat16 exp_shfit = __ushort_as_bfloat16(0x7b80); // 2^120 + PRAGMA_UNROLL + for (int i = 0; i < 4; ++i) { + result[i] *= exp_shfit; + } + return result; +#else + return {}; +#endif +}; + +template +struct ConvertKvCache { + template + __device__ static auto convert(const Array& vi) + { + Array vo; + PRAGMA_UNROLL + for (int n = 0; n < N; n += 4) { + auto& ui = (const Array&)vi[n]; + if constexpr (std::is_same_v) { + return cvt_bf16x4_e4m3(ui); + } + else { + static_assert(!std::is_same_v, "not implemented"); + } + } + } +}; + template inline __device__ void StoreQuantParam(T* dst, Array src) { diff --git a/src/turbomind/kernels/attention/reduce.h b/src/turbomind/kernels/attention/reduce.h index 
2c37a21093..c078de5958 100644 --- a/src/turbomind/kernels/attention/reduce.h +++ b/src/turbomind/kernels/attention/reduce.h @@ -2,7 +2,12 @@ #pragma once +#include "cta_map.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/thread_map.h" +#include #include +#include namespace turbomind::attention { diff --git a/src/turbomind/kernels/attention/reduce_kernel.h b/src/turbomind/kernels/attention/reduce_kernel.h index 1ad369ddd5..88a3ab3af8 100644 --- a/src/turbomind/kernels/attention/reduce_kernel.h +++ b/src/turbomind/kernels/attention/reduce_kernel.h @@ -1,10 +1,8 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/kernels/attention/array_ops.h" #include "src/turbomind/kernels/attention/cta_map.h" -#include "src/turbomind/kernels/attention/thread_map.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" -#include +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/thread_map.h" #include namespace turbomind::attention { diff --git a/src/turbomind/kernels/attention/reference.cu b/src/turbomind/kernels/attention/reference.cu index c0cc23fd1b..d2ea34ccaa 100644 --- a/src/turbomind/kernels/attention/reference.cu +++ b/src/turbomind/kernels/attention/reference.cu @@ -1,7 +1,8 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "array_ops.h" #include "reference.h" +#include "src/turbomind/kernels/attention/rotary_embedding.h" +#include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/unfused_attention_kernels.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h new file mode 100644 index 0000000000..8bc54ad268 --- /dev/null +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -0,0 +1,188 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" + +namespace turbomind { + +template +struct RotaryEmbedding { + + static_assert(N % 2 == 0); + + Array cs_; + + bool is_valid_; + + __device__ RotaryEmbedding(float base, int dims, int timestep, int2 offset) + { + const int idx = offset.x; + is_valid_ = idx < dims; + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + const float2 tmp = get_coefficient(idx + i, dims, base, timestep); + cs_[i] = tmp.x; + cs_[i + 1] = tmp.y; + } + } + + // ! 
depending on the context, this function may generate different result when inlined + static __device__ __noinline__ float2 get_coefficient(int idx, int dims, float base, int timestep) + { + const float inv_freq = timestep / powf(base, idx / (float)dims); + float2 cs; + sincosf(inv_freq, &cs.y, &cs.x); + return cs; + } + + template + __device__ void apply(Array& x) + { + + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float tmp0 = cs_[i] * (float)x[i] - cs_[i + 1] * (float)x[i + 1]; + float tmp1 = cs_[i] * (float)x[i + 1] + cs_[i + 1] * (float)x[i]; + if (is_valid_) { + x[i] = (T)tmp0; + x[i + 1] = (T)tmp1; + } + } + } +}; +template +__device__ void ApplyRotaryEmbedding(Array& x, float base, int dims, int ti, int di) +{ + PRAGMA_UNROLL + for (int d1 = 0; d1 < 2; ++d1) { + int d = d1 * 8 + di; + float inv_freq = ti / powf(base, d / (float)dims); + float2 cs; + sincosf(inv_freq, &cs.y, &cs.x); + C x1 = (C)cs.x * (C)x[d1 * 2 + 0] - (C)cs.y * (C)x[d1 * 2 + 1]; + C x2 = (C)cs.x * (C)x[d1 * 2 + 1] + (C)cs.y * (C)x[d1 * 2 + 0]; + x[d1 * 2 + 0] = (T)x1; + x[d1 * 2 + 1] = (T)x2; + } +} + +template +struct FastRoPE { + + static_assert(N % 2 == 0); + + Array inv_freq_; + bool is_valid_; + + __device__ FastRoPE(int idx, + D dims, + float base, + float ti_scale, + float llama3_inv_scaling_factor, + float llama3_alpha, + float llama3_beta, + std::integral_constant) + { + is_valid_ = idx < dims; + /// TODO: Take this away from device code + const float scale_factor = -log2f(base) / dims; + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + inv_freq_[i / 2] = ti_scale * exp2f((idx + i) * scale_factor); + } + // clang-format off + /* The [llama3 rope](https://github.com/huggingface/transformers/blob/5f4ee98a7ade33e1c54fdd6181d04ee7b426b392/src/transformers/modeling_rope_utils.py#L298) + * used by llama3.1 equals to the following equation, given the precommuted parameters as: + ```C++ + inv_scaling_factor = 1 / factor; + inv_diff_freq_factor = 1 / (high_freq_factor - low_freq_factor); + alpha = old_context_len / (2 * PI) * inv_diff_freq_factor; + beta = low_freq_factor * inv_diff_freq_factor + ``` + */ + // clang-format on + if (llama3_inv_scaling_factor) { + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + auto freq = inv_freq_[i / 2]; + auto smooth = fmaxf(0.f, fminf(1.f, llama3_alpha * freq - llama3_beta)); + inv_freq_[i / 2] = (1 - smooth) * freq * llama3_inv_scaling_factor + smooth * freq; + } + } + } + + template + __device__ void apply(Array& x, float timestep) + { + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + float c, s; + sincosf(timestep * inv_freq_[i / 2], &s, &c); + float tmp0 = c * (float)x[i] - s * (float)x[i + 1]; + float tmp1 = c * (float)x[i + 1] + s * (float)x[i]; + if (is_valid_) { + x[i] = (T)tmp0; + x[i + 1] = (T)tmp1; + } + } + } +}; + +template +struct RoPE { + Array inv_freqs_; + + RoPE() = default; + __device__ RoPE(float idx, float base, float dims) + { + for (int i = 0; i < N; ++i) { + inv_freqs_[i] = powf(base, idx / dims + (C / dims) * i); + } + } + + template + __device__ void apply(Array& x, float timestep) + { + for (int i = 0; i < N; ++i) { + const float inv_freq = timestep * inv_freqs_[i]; + float2 cs; + sincosf(inv_freq, &cs.y, &cs.x); + float tmp0 = cs.x * (float)x[i * 2] - cs.y * (float)x[i * 2 + 1]; + float tmp1 = cs.x * (float)x[i * 2 + 1] + cs.y * (float)x[i * 2]; + x[i * 2] = (T)tmp0; + x[i * 2 + 1] = (T)tmp1; + } + } +}; + +struct LogNScaling { + + float scale_; + + __device__ static float get_scale(int seq_len, int max_position_embeddings) + { + if 
(seq_len <= max_position_embeddings) { + return 1.f; + } + else { + return log2f(seq_len) / log2f(max_position_embeddings); + } + } + + __device__ LogNScaling(int seq_len, int max_position_embeddings) + { + scale_ = get_scale(seq_len, max_position_embeddings); + } + + template + __device__ void apply(Array& x) const + { + PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + x[i] = (T)((float)x[i] * scale_); + } + } +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/test_quant.cu b/src/turbomind/kernels/attention/test_quant.cu index 15979d2af6..1248d5fbac 100644 --- a/src/turbomind/kernels/attention/test_quant.cu +++ b/src/turbomind/kernels/attention/test_quant.cu @@ -1,9 +1,8 @@ // Copyright (c) OpenMMLab. All rights reserved. #include "quantization.h" -#include "src/turbomind/kernels/attention/array_ops.h" #include "src/turbomind/kernels/attention/test_utils.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/macro.h" #include #include diff --git a/src/turbomind/kernels/core/array.h b/src/turbomind/kernels/core/array.h new file mode 100644 index 0000000000..85186ff94a --- /dev/null +++ b/src/turbomind/kernels/core/array.h @@ -0,0 +1,143 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/sub_byte_ptr.h" + +namespace turbomind { + +template +struct Array { + using value_type = T; + using size_type = int; + using difference_type = int; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + using iterator = pointer; + using const_iterator = const_pointer; + + static_assert(N > 0); + + T __a[N]; + + TM_HOST_DEVICE constexpr reference operator[](size_type i) noexcept + { + return __a[i]; + } + + TM_HOST_DEVICE constexpr const_reference operator[](size_type i) const noexcept + { + return __a[i]; + } + + TM_HOST_DEVICE constexpr reference front() noexcept + { + return *begin(); + } + + TM_HOST_DEVICE constexpr const_reference front() const noexcept + { + return *begin(); + } + + TM_HOST_DEVICE constexpr reference back() noexcept + { + return *(end() - 1); + } + + TM_HOST_DEVICE constexpr const_reference back() const noexcept + { + return *(end() - 1); + } + + TM_HOST_DEVICE constexpr pointer data() noexcept + { + return &__a[0]; + } + + TM_HOST_DEVICE constexpr const_pointer data() const noexcept + { + return &__a[0]; + } + + TM_HOST_DEVICE constexpr iterator begin() noexcept + { + return data(); + } + + TM_HOST_DEVICE constexpr const_iterator begin() const noexcept + { + return data(); + } + + TM_HOST_DEVICE constexpr iterator end() noexcept + { + return data() + N; + } + + TM_HOST_DEVICE constexpr const_iterator end() const noexcept + { + return data() + N; + } + + TM_HOST_DEVICE static constexpr std::integral_constant size() noexcept + { + return {}; + } + + TM_HOST_DEVICE static constexpr std::false_type empty() noexcept + { + return {}; + } +}; + +template +struct Array { + using value_type = detail::__uint4_t; + using size_type = int; + using difference_type = int; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = SubBytePtr; + using const_pointer = SubBytePtr; + + // static_assert(N % 8 == 0); + + detail::__uint4_t __a[N / 8]; + + TM_HOST_DEVICE constexpr reference operator[](size_type i) 
noexcept + { + return __a[i / 8]; + } + + TM_HOST_DEVICE constexpr const_reference operator[](size_type i) const noexcept + { + return __a[i / 8]; + } + + TM_HOST_DEVICE static constexpr std::integral_constant size() noexcept + { + return {}; + } + + TM_HOST_DEVICE static constexpr std::false_type empty() noexcept + { + return {}; + } + + TM_HOST_DEVICE constexpr pointer data() noexcept + { + return {(char*)&__a[0]}; + } +}; + +static_assert(sizeof(Array) == 4); +static_assert(sizeof(Array) == 8); +static_assert(sizeof(Array) == 12); +static_assert(sizeof(Array) == 16); + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/array_ops.h b/src/turbomind/kernels/core/array_ops.h similarity index 55% rename from src/turbomind/kernels/attention/array_ops.h rename to src/turbomind/kernels/core/array_ops.h index 3c92dfc796..6b639abc83 100644 --- a/src/turbomind/kernels/attention/array_ops.h +++ b/src/turbomind/kernels/core/array_ops.h @@ -2,10 +2,9 @@ #pragma once -#include "src/turbomind/kernels/gemm_s_f16/common.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include -#include +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include #include namespace turbomind { @@ -172,186 +171,6 @@ inline __device__ void copy(const Array (&src)[M], Array (&dst)[M]) } } -template -struct RotaryEmbedding { - - static_assert(N % 2 == 0); - - Array cs_; - - bool is_valid_; - - __device__ RotaryEmbedding(float base, int dims, int timestep, int2 offset) - { - const int idx = offset.x; - is_valid_ = idx < dims; - PRAGMA_UNROLL - for (int i = 0; i < N; i += 2) { - const float2 tmp = get_coefficient(idx + i, dims, base, timestep); - cs_[i] = tmp.x; - cs_[i + 1] = tmp.y; - } - } - - // ! depending on the context, this function may generate different result when inlined - static __device__ __noinline__ float2 get_coefficient(int idx, int dims, float base, int timestep) - { - const float inv_freq = timestep / powf(base, idx / (float)dims); - float2 cs; - sincosf(inv_freq, &cs.y, &cs.x); - return cs; - } - - template - __device__ void apply(Array& x) - { - - PRAGMA_UNROLL - for (int i = 0; i < N; i += 2) { - float tmp0 = cs_[i] * (float)x[i] - cs_[i + 1] * (float)x[i + 1]; - float tmp1 = cs_[i] * (float)x[i + 1] + cs_[i + 1] * (float)x[i]; - if (is_valid_) { - x[i] = (T)tmp0; - x[i + 1] = (T)tmp1; - } - } - } -}; - -template -__device__ void ApplyRotaryEmbedding(Array& x, float base, int dims, int ti, int di) -{ - PRAGMA_UNROLL - for (int d1 = 0; d1 < 2; ++d1) { - int d = d1 * 8 + di; - float inv_freq = ti / powf(base, d / (float)dims); - float2 cs; - sincosf(inv_freq, &cs.y, &cs.x); - C x1 = (C)cs.x * (C)x[d1 * 2 + 0] - (C)cs.y * (C)x[d1 * 2 + 1]; - C x2 = (C)cs.x * (C)x[d1 * 2 + 1] + (C)cs.y * (C)x[d1 * 2 + 0]; - x[d1 * 2 + 0] = (T)x1; - x[d1 * 2 + 1] = (T)x2; - } -} - -template -struct FastRoPE { - - static_assert(N % 2 == 0); - - Array inv_freq_; - bool is_valid_; - - __device__ FastRoPE(int idx, - D dims, - float base, - float ti_scale, - float llama3_inv_scaling_factor, - float llama3_alpha, - float llama3_beta, - std::integral_constant) - { - is_valid_ = idx < dims; - /// TODO: Take this away from device code - const float scale_factor = -log2f(base) / dims; - PRAGMA_UNROLL - for (int i = 0; i < N; i += 2) { - inv_freq_[i / 2] = ti_scale * exp2f((idx + i) * scale_factor); - } - // clang-format off - /* The [llama3 
rope](https://github.com/huggingface/transformers/blob/5f4ee98a7ade33e1c54fdd6181d04ee7b426b392/src/transformers/modeling_rope_utils.py#L298) - * used by llama3.1 equals to the following equation, given the precommuted parameters as: - ```C++ - inv_scaling_factor = 1 / factor; - inv_diff_freq_factor = 1 / (high_freq_factor - low_freq_factor); - alpha = old_context_len / (2 * PI) * inv_diff_freq_factor; - beta = low_freq_factor * inv_diff_freq_factor - ``` - */ - // clang-format on - if (llama3_inv_scaling_factor) { - PRAGMA_UNROLL - for (int i = 0; i < N; i += 2) { - auto freq = inv_freq_[i / 2]; - auto smooth = fmaxf(0.f, fminf(1.f, llama3_alpha * freq - llama3_beta)); - inv_freq_[i / 2] = (1 - smooth) * freq * llama3_inv_scaling_factor + smooth * freq; - } - } - } - - template - __device__ void apply(Array& x, float timestep) - { - PRAGMA_UNROLL - for (int i = 0; i < N; i += 2) { - float c, s; - sincosf(timestep * inv_freq_[i / 2], &s, &c); - float tmp0 = c * (float)x[i] - s * (float)x[i + 1]; - float tmp1 = c * (float)x[i + 1] + s * (float)x[i]; - if (is_valid_) { - x[i] = (T)tmp0; - x[i + 1] = (T)tmp1; - } - } - } -}; - -template -struct RoPE { - Array inv_freqs_; - - RoPE() = default; - __device__ RoPE(float idx, float base, float dims) - { - for (int i = 0; i < N; ++i) { - inv_freqs_[i] = powf(base, idx / dims + (C / dims) * i); - } - } - - template - __device__ void apply(Array& x, float timestep) - { - for (int i = 0; i < N; ++i) { - const float inv_freq = timestep * inv_freqs_[i]; - float2 cs; - sincosf(inv_freq, &cs.y, &cs.x); - float tmp0 = cs.x * (float)x[i * 2] - cs.y * (float)x[i * 2 + 1]; - float tmp1 = cs.x * (float)x[i * 2 + 1] + cs.y * (float)x[i * 2]; - x[i * 2] = (T)tmp0; - x[i * 2 + 1] = (T)tmp1; - } - } -}; - -struct LogNScaling { - - float scale_; - - __device__ static float get_scale(int seq_len, int max_position_embeddings) - { - if (seq_len <= max_position_embeddings) { - return 1.f; - } - else { - return log2f(seq_len) / log2f(max_position_embeddings); - } - } - - __device__ LogNScaling(int seq_len, int max_position_embeddings) - { - scale_ = get_scale(seq_len, max_position_embeddings); - } - - template - __device__ void apply(Array& x) const - { - PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - x[i] = (T)((float)x[i] * scale_); - } - } -}; - template inline __device__ void Store(T* __restrict__ dst, const Array& src) { @@ -364,6 +183,9 @@ inline __device__ void Store(T* __restrict__ dst, const Array& src) else if constexpr (sizeof(Array) == sizeof(uint1)) { *(uint1*)dst = (const uint1&)src; } + else if constexpr (sizeof(Array) == sizeof(ushort)) { + *(ushort*)dst = (const ushort&)src; + } else if constexpr (sizeof(Array) % sizeof(uint4) == 0) { // uncoalesced static_assert(bitsof % 8 == 0, "raw pointer arithmetic of sub-byte types"); constexpr int M = sizeof(Array) / sizeof(uint4); @@ -396,6 +218,25 @@ inline __device__ void Stcs(T* __restrict__ dst, const Array& src) } } +template +inline __device__ void Stcg(T* __restrict__ dst, const Array& src) +{ + static_assert(sizeof(Array) <= sizeof(uint4)); + + if constexpr (sizeof(Array) == sizeof(uint4)) { + __stcg((uint4*)dst, (const uint4&)src); + } + else if constexpr (sizeof(Array) == sizeof(uint2)) { + __stcg((uint2*)dst, (const uint2&)src); + } + else if constexpr (sizeof(Array) == sizeof(uint1)) { + __stcg((uint*)dst, (const uint&)src); + } + else { + static_assert(!std::is_same_v); + } +} + template inline __device__ void Ldg(Array& dst, const T* src) { @@ -469,6 +310,29 @@ inline __device__ void 
LdShared(Array& dst, uint32_t uintptr) } } +template +inline __device__ void StShared(uint32_t uintptr, Array& src) +{ + static_assert(sizeof(Array) <= sizeof(uint4)); + if constexpr (sizeof(Array) == sizeof(uint4)) { + uint4& p = (uint4&)src; + // clang-format off + asm volatile("st.shared.v4.b32 [%0], {%1,%2,%3,%4};\n" :: "r"(uintptr), "r"(p.x), "r"(p.y), "r"(p.z), "r"(p.w) ); + // clang-format on + } + else if constexpr (sizeof(Array) == sizeof(uint2)) { + uint2& p = (uint2&)src; + asm volatile("st.shared.v2.b32 [%0], {%1,%2};\n" ::"r"(uintptr), "r"(p.x), "r"(p.y)); + } + else if constexpr (sizeof(Array) == sizeof(uint)) { + uint& p = (uint&)src; + asm volatile("st.shared.b32 [%0], %1;\n" ::"r"(uintptr), "r"(p)); + } + else { + static_assert(!std::is_same_v); + } +} + template inline __device__ Array blockSum(Array val, T* smem_red, int warp_id, int lane_id) { @@ -498,43 +362,73 @@ inline __device__ Array blockSum(Array val, T* smem_red, int warp_id return val; } -////////////////////////////////////////////////////////////////////////////////////////////////// +template +__device__ void CpAsync(T* dst, const Array* __restrict__ src) +{ + const int smem_int_ptr = cast_smem_ptr_to_uint(dst); + constexpr int cp_size = sizeof(Array); +#if TURBOMIND_ARCH_SM80 + asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n" ::"r"(smem_int_ptr), "l"(src), "n"(cp_size)); +#else + assert(TURBOMIND_ARCH_SM80); +#endif +} -// generic case for floating point -> floating point / integer -> integer conversion -template -struct ConvertKvCache { - __device__ __host__ ConvertKvCache(float, float) {} - template - __device__ static auto convert(const Array& vi) - { - Array vo; - PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - vo[i] = (To)vi[i]; - } - return vo; - } - template - inline __device__ auto operator()(const Array& vi) const -> Array - { - return convert(vi); - } -}; +__inline__ __device__ uint transpose_m8n8_b16_warp_shuffle(uint value) +{ + const int lane_id = threadIdx.x % WARP_SIZE; + int src_lane = lane_id / 8 + lane_id % 4 * 8; + uint u0 = __shfl_sync(0xffffffff, value, src_lane); + uint u1 = __shfl_sync(0xffffffff, value, src_lane + 4); + short2 r; -// generic case for converting to same type, bypass -template -struct ConvertKvCache { - __device__ __host__ ConvertKvCache(float, float) {} - template - __device__ static auto convert(const Array& v) - { - return v; + if (lane_id % 8 < 4) { + r.x = ((short2&)u0).x; + r.y = ((short2&)u1).x; } - template - inline __device__ auto operator()(const Array& v) const -> Array - { - return convert(v); + else { + r.x = ((short2&)u0).y; + r.y = ((short2&)u1).y; } -}; + return (uint&)r; +} + +#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8) +__inline__ __device__ uint transpose_m8n8_b16_movmatrix(uint a) +{ +#if TURBOMIND_ARCH_SM75 + uint d; + asm volatile("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;\n" : "=r"(d) : "r"(a)); + return d; +#else + assert(TURBOMIND_ARCH_SM75); + return 0; +#endif +} +#endif + +__inline__ __device__ uint32_t transpose_m8n8_b16(uint32_t a) +{ +#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8) + return transpose_m8n8_b16_movmatrix(a); +#else + return transpose_m8n8_b16_warp_shuffle(a); +#endif +} + +__inline__ __device__ Array transpose_m8n8_b32(const Array& x) +{ + uint32_t lo = __byte_perm(x[0], x[1], 0x5410); + uint32_t hi = __byte_perm(x[0], x[1], 0x7632); + + lo = transpose_m8n8_b16(lo); + hi = transpose_m8n8_b16(hi); + + Array y; + y[0] = __byte_perm(lo, hi, 0x5410); + y[1] = __byte_perm(lo, hi, 
0x7632); + + return y; +} } // namespace turbomind diff --git a/src/turbomind/kernels/core/common.h b/src/turbomind/kernels/core/common.h new file mode 100644 index 0000000000..6d524c2836 --- /dev/null +++ b/src/turbomind/kernels/core/common.h @@ -0,0 +1,60 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) +#define TURBOMIND_ARCH_SM70 1 +#else +#define TURBOMIND_ARCH_SM70 0 +#endif + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) +#define TURBOMIND_ARCH_SM75 1 +#else +#define TURBOMIND_ARCH_SM75 0 +#endif + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +#define TURBOMIND_ARCH_SM80 1 +#else +#define TURBOMIND_ARCH_SM80 0 +#endif + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) +#define TURBOMIND_ARCH_SM90 1 +#else +#define TURBOMIND_ARCH_SM90 0 +#endif + +#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__) +#if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) +#define PRAGMA_UNROLL _Pragma("unroll") +#define PRAGMA_NO_UNROLL _Pragma("unroll 1") +#else +#define PRAGMA_UNROLL #pragma unroll +#define PRAGMA_NO_UNROLL #pragma unroll 1 +#endif +#else +#define PRAGMA_UNROLL +#define PRAGMA_NO_UNROLL +#endif + +#if defined(__CUDACC__) +#define TM_HOST_DEVICE __forceinline__ __host__ __device__ +#define TM_DEVICE __forceinline__ __device__ +#define TM_HOST __forceinline__ __host__ +#else +#define TM_HOST_DEVICE inline +#define TM_DEVICE inline +#define TM_HOST inline +#endif + +constexpr int WARP_SIZE = 32; + +#ifndef uint +using uint = unsigned int; +#endif + +#ifndef ushort +using ushort = unsigned short int; +#endif diff --git a/src/turbomind/kernels/attention/data_type.h b/src/turbomind/kernels/core/data_type.h similarity index 59% rename from src/turbomind/kernels/attention/data_type.h rename to src/turbomind/kernels/core/data_type.h index 1244bd3f92..f57d1a2714 100644 --- a/src/turbomind/kernels/attention/data_type.h +++ b/src/turbomind/kernels/core/data_type.h @@ -1,12 +1,21 @@ // Copyright (c) OpenMMLab. All rights reserved. 
+#pragma once + #include #include -#pragma once +#include +#if ENABLE_BF16 +#include +#endif namespace turbomind { +struct uint1_t { +}; +struct uint2_t { +}; struct uint3_t { }; struct uint4_t { @@ -21,21 +30,40 @@ struct bitsof_t: std::integral_constant { }; template<> -struct bitsof_t: std::integral_constant { +struct bitsof_t: std::integral_constant { +}; + +template<> +struct bitsof_t: std::integral_constant { }; + +template<> +struct bitsof_t: std::integral_constant { +}; // 2 + 1 + template<> struct bitsof_t: std::integral_constant { }; + template<> struct bitsof_t: std::integral_constant { -}; +}; // 4 + 1 + template<> struct bitsof_t: std::integral_constant { -}; +}; // 4 + 2 template inline constexpr bitsof_t bitsof{}; +struct fp8 { + char v; +}; +struct fp8_e4m3: fp8 { +}; +struct fp8_e5m2: fp8 { +}; + namespace detail { struct __uint4_t { @@ -44,42 +72,11 @@ struct __uint4_t { } // namespace detail -template -struct SubBytePtr { - - __device__ T& operator[](int i) - { - return *reinterpret_cast(ptr_ + i * bitsof / bitsof); - } - - friend __device__ SubBytePtr operator+(const SubBytePtr a, int n) - { - return SubBytePtr{a.ptr_ + n * bitsof / bitsof}; - } - - friend __device__ SubBytePtr operator+(int n, const SubBytePtr a) - { - return a + n; - } - - __device__ explicit operator T*() const - { - return (T*)ptr_; - } - - char* ptr_; -}; - template struct get_pointer_type_t { using type = T*; }; -template -struct get_pointer_type_t % 8 != 0>> { - using type = SubBytePtr; -}; - template using get_pointer_type = typename get_pointer_type_t::type; diff --git a/src/turbomind/kernels/attention/smem_layout.h b/src/turbomind/kernels/core/layout.h similarity index 80% rename from src/turbomind/kernels/attention/smem_layout.h rename to src/turbomind/kernels/core/layout.h index afde81233b..e0679e5229 100644 --- a/src/turbomind/kernels/attention/smem_layout.h +++ b/src/turbomind/kernels/core/layout.h @@ -2,7 +2,7 @@ #pragma once -#include "src/turbomind/kernels/attention/data_type.h" +#include "src/turbomind/kernels/core/data_type.h" namespace turbomind { template @@ -46,7 +46,7 @@ struct Identity { } }; -template +template struct SmemLayoutV2 { // (C0,S0),( C1, S1) @@ -55,8 +55,8 @@ struct SmemLayoutV2 { static constexpr int S = S_; static constexpr int C = C_; - static constexpr int S0 = S0_; - static constexpr int C0 = C0_; + static constexpr int S0 = S0_ < 0 ? S : S0_; + static constexpr int C0 = C0_ < 0 ? 
C : C0_; static_assert(S % S0 == 0); static_assert(C % C0 == 0); @@ -71,6 +71,8 @@ struct SmemLayoutV2 { using Swizzle = Swizzle_; + static constexpr int kIsTrivial = S == S0 && C == C0 && std::is_same_v; + __forceinline__ __device__ static int apply(int s, int c, int offset = 0) { int s1 = s / S0; @@ -122,11 +124,25 @@ struct SmemAccessor { return ptr_[layout_(s, c, offset)]; } - // __device__ T& operator()(int s, int c, int offset) - // { - // // return *((T*)((char*)ptr_ + offset) + layout_(s, c)); - // return *(T*)((char*)(ptr_ + layout_(s, c)) + offset); - // } + __device__ T& operator()(int idx) + { + return ptr_[idx]; + } +}; + +template +struct Stride { + T0 v0; + T1 v1; + + // CTAD + __host__ __device__ Stride(T0 v0, T1 v1): v0{v0}, v1{v1} {} + + template + __host__ __device__ constexpr auto operator()(I0 i0, I1 i1) const + { + return v0 * i0 + v1 * i1; + } }; } // namespace turbomind diff --git a/src/turbomind/kernels/core/math.h b/src/turbomind/kernels/core/math.h new file mode 100644 index 0000000000..e71d04c39b --- /dev/null +++ b/src/turbomind/kernels/core/math.h @@ -0,0 +1,37 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include + +namespace turbomind { + +template +TM_HOST_DEVICE constexpr T ceil_div(T a, T b) +{ + return (a + b - 1) / b; +} + +template +TM_HOST_DEVICE constexpr T round_up(T a, T b) +{ + return (a + b - 1) / b * b; +} + +template +TM_HOST_DEVICE constexpr T log2(T x) +{ + T n = 0; + while (x != 1) { + x /= 2; + ++n; + } + return n; +} + +// static_assert(log2(65536) == 16); +// static_assert(log2(32) == 5); +// static_assert(log2(1) == 0); + +} // namespace turbomind diff --git a/src/turbomind/kernels/core/meta.h b/src/turbomind/kernels/core/meta.h new file mode 100644 index 0000000000..01b74a06ce --- /dev/null +++ b/src/turbomind/kernels/core/meta.h @@ -0,0 +1,52 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +namespace turbomind { + +template +struct basic_type { + using type = T; +}; + +template +constexpr basic_type type_c{}; + +template +struct constant { + using type = constant; + using value_type = decltype(v); + + static constexpr value_type value = v; + + constexpr value_type operator()() const noexcept + { + return v; + } + constexpr operator value_type() const noexcept + { + return v; + } +}; + +template +struct pair { +}; + +template +constexpr auto first(pair) +{ + return u; +} + +template +constexpr auto second(pair) +{ + return v; +} + +template +struct triplet { +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/core/mma.h b/src/turbomind/kernels/core/mma.h new file mode 100644 index 0000000000..768cdcbe1c --- /dev/null +++ b/src/turbomind/kernels/core/mma.h @@ -0,0 +1,211 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include + +namespace turbomind { + +__inline__ __device__ void +mma_m8n8k4_row_col(Array& d, const Array& a, const Array& b, Array& c) +{ +#if TURBOMIND_ARCH_SM70 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + // clang-format off + asm volatile( + "mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3]), "=f"(d[4]), "=f"(d[5]), "=f"(d[6]), "=f"(d[7]) + : "r"(A[0]), "r"(A[1]), + "r"(B[0]), "r"(B[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7])); +// clang-format on +#endif +} + +__inline__ __device__ void +mma_m8n8k4_row_row(Array& d, const Array& a, const Array& b, Array& c) +{ +#if TURBOMIND_ARCH_SM70 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + // clang-format off + asm volatile( + "mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32" + "{%0, %1, %2, %3, %4, %5, %6, %7}," + "{%8, %9}," + "{%10, %11}," + "{%12, %13, %14, %15, %16, %17, %18, %19};" + : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3]), "=f"(d[4]), "=f"(d[5]), "=f"(d[6]), "=f"(d[7]) + : "r"(A[0]), "r"(A[1]), + "r"(B[0]), "r"(B[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7])); +// clang-format on +#endif +} + +__inline__ __device__ void +mma_m16n8k8_row_col(Array& d, const Array& a, const Array& b, Array& c) +{ +#if TURBOMIND_ARCH_SM75 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + float const* C = reinterpret_cast(&c); + float* D = reinterpret_cast(&d); + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void +mma_m16n8k8_row_col(Array& d, const Array& a, const Array& b, Array& c) +{ +#if TURBOMIND_ARCH_SM75 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + uint32_t const* C = reinterpret_cast(&c); + uint32_t* D = reinterpret_cast(&d); + asm volatile("mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3}, {%4}, {%5,%6};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1])); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void mma_m16n8k8_row_col(Array& d, + const Array& a, + const Array& b, + Array& c) +{ +#if TURBOMIND_ARCH_SM80 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + float const* C = reinterpret_cast(&c); + float* D = reinterpret_cast(&d); + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 {%0,%1,%2,%3}, " + "{%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); +#else + assert(TURBOMIND_ARCH_SM80); +#endif +} + +__inline__ __device__ void mma_m16n8k8_row_col(Array& d, + const Array& a, + const Array& b, + Array& c) +{ +#if TURBOMIND_ARCH_SM80 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + uint32_t const* C = reinterpret_cast(&c); + 
uint32_t* D = reinterpret_cast(&d); + asm volatile("mma.sync.aligned.m16n8k8.row.col.bf16.bf16.bf16.bf16 {%0,%1}, " + "{%2,%3}, {%4}, {%5,%6};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1])); +#else + assert(TURBOMIND_ARCH_SM80); +#endif +} + +__inline__ __device__ void +mma_m16n8k16_row_col(Array& d, const Array& a, const Array& b, Array& c) +{ +#if TURBOMIND_ARCH_SM80 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + float const* C = reinterpret_cast(&c); + float* D = reinterpret_cast(&d); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); +#else + const Array* _a = (const Array*)&a; + const Array* _b = (const Array*)&b; + mma_m16n8k8_row_col(d, _a[0], _b[0], c); + mma_m16n8k8_row_col(d, _a[1], _b[1], d); +#endif +} + +__inline__ __device__ void +mma_m16n8k16_row_col(Array& d, const Array& a, const Array& b, Array& c) +{ +#if TURBOMIND_ARCH_SM80 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + uint32_t const* C = reinterpret_cast(&c); + uint32_t* D = reinterpret_cast(&d); + asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1])); +#else + const Array* _a = (const Array*)&a; + const Array* _b = (const Array*)&b; + mma_m16n8k8_row_col(d, _a[0], _b[0], c); + mma_m16n8k8_row_col(d, _a[1], _b[1], d); +#endif +} + +__inline__ __device__ void mma_m16n8k16_row_col(Array& d, + const Array& a, + const Array& b, + Array& c) +{ +#if TURBOMIND_ARCH_SM80 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + float const* C = reinterpret_cast(&c); + float* D = reinterpret_cast(&d); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); +#else + const Array* _a = (const Array*)&a; + const Array* _b = (const Array*)&b; + mma_m16n8k8_row_col(d, _a[0], _b[0], c); + mma_m16n8k8_row_col(d, _a[1], _b[1], d); +#endif +} + +__inline__ __device__ void mma_m16n8k16_row_col(Array& d, + const Array& a, + const Array& b, + Array& c) +{ +#if TURBOMIND_ARCH_SM80 + uint32_t const* A = reinterpret_cast(&a); + uint32_t const* B = reinterpret_cast(&b); + uint32_t const* C = reinterpret_cast(&c); + uint32_t* D = reinterpret_cast(&d); + asm volatile("mma.sync.aligned.m16n8k16.row.col.bf16.bf16.bf16.bf16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1])); +#else + const Array* _a = (const Array*)&a; + const Array* _b = (const Array*)&b; + mma_m16n8k8_row_col(d, _a[0], _b[0], c); + mma_m16n8k8_row_col(d, _a[1], _b[1], d); +#endif +} + +} // namespace turbomind diff --git a/src/turbomind/kernels/core/pipe_iter.h b/src/turbomind/kernels/core/pipe_iter.h new file mode 100644 index 0000000000..0add2e4e20 --- /dev/null +++ b/src/turbomind/kernels/core/pipe_iter.h @@ -0,0 +1,25 @@ +// Copyright (c) 
OpenMMLab. All rights reserved. + +#pragma once + +namespace turbomind { + +template +struct PipeIter { + static constexpr int kMaxStep = Stages * Step; + + int r = 0; + int w = kMaxStep - Step; + + __inline__ __device__ PipeIter& operator++() + { + w = r; + r += Step; + if (r == kMaxStep) { + r -= kMaxStep; + } + return *this; + } +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/core/smem.h b/src/turbomind/kernels/core/smem.h new file mode 100644 index 0000000000..0bb03d3d0c --- /dev/null +++ b/src/turbomind/kernels/core/smem.h @@ -0,0 +1,106 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include + +namespace turbomind { + +__inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const* const ptr) +{ + return (uint32_t)__cvta_generic_to_shared(ptr); +} + +__inline__ __device__ void ldmatrix_m8n8_x4_b16(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr) +{ +#if TURBOMIND_ARCH_SM75 + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(smem_int_ptr)); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void ldsm_x4_trans(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr) +{ +#if TURBOMIND_ARCH_SM75 + asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) + : "r"(smem_int_ptr)); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void ldmatrix_m8n8_x2_b16(uint& d0, uint& d1, uint32_t smem_int_ptr) +{ +#if TURBOMIND_ARCH_SM75 + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(d0), "=r"(d1) : "r"(smem_int_ptr)); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void ldsm_x2_trans(uint& d0, uint& d1, uint32_t smem_int_ptr) +{ +#if TURBOMIND_ARCH_SM75 + asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0,%1}, [%2];\n" + : "=r"(d0), "=r"(d1) + : "r"(smem_int_ptr)); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void ldmatrix_m8n8_x1_b16(uint& d0, uint32_t smem_int_ptr) +{ +#if TURBOMIND_ARCH_SM75 + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 %0, [%1];\n" : "=r"(d0) : "r"(smem_int_ptr)); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void ldsm_x1_trans(uint& d0, uint32_t smem_int_ptr) +{ +#if TURBOMIND_ARCH_SM75 + asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 %0, [%1];\n" : "=r"(d0) : "r"(smem_int_ptr)); +#else + assert(TURBOMIND_ARCH_SM75); +#endif +} + +__inline__ __device__ void ldsm_x4(Array& d, uint32_t smem_int_ptr) +{ + ldmatrix_m8n8_x4_b16(d[0], d[1], d[2], d[3], smem_int_ptr); +} + +__inline__ __device__ void ldsm_x2(Array& d, uint32_t smem_int_ptr) +{ + ldmatrix_m8n8_x2_b16(d[0], d[1], smem_int_ptr); +} + +__inline__ __device__ void ldsm_x1(Array& d, uint32_t smem_int_ptr) +{ + ldmatrix_m8n8_x1_b16(d[0], smem_int_ptr); +} + +__inline__ __device__ void ldsm_x4_trans(Array& d, uint32_t smem_int_ptr) +{ + ldsm_x4_trans(d[0], d[1], d[2], d[3], smem_int_ptr); +} + +__inline__ __device__ void ldsm_x2_trans(Array& d, uint32_t smem_int_ptr) +{ + ldsm_x2_trans(d[0], d[1], smem_int_ptr); +} + +__inline__ __device__ void ldsm_x1_trans(Array& d, uint32_t smem_int_ptr) +{ + ldsm_x1_trans(d[0], smem_int_ptr); +} + +} // namespace turbomind diff --git a/src/turbomind/kernels/core/sub_byte_ptr.h 
b/src/turbomind/kernels/core/sub_byte_ptr.h new file mode 100644 index 0000000000..da2e6c525a --- /dev/null +++ b/src/turbomind/kernels/core/sub_byte_ptr.h @@ -0,0 +1,51 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/data_type.h" + +namespace turbomind { + +template +struct SubBytePtr { + + constexpr SubBytePtr() = default; + + constexpr __host__ __device__ explicit SubBytePtr(T* ptr): ptr_((char*)ptr) {} + + constexpr __host__ __device__ SubBytePtr(char* ptr): ptr_(ptr) {} + + __device__ T& operator[](int i) + { + return *reinterpret_cast(ptr_ + i * bitsof / bitsof); + } + + friend __device__ SubBytePtr operator+(const SubBytePtr a, int n) + { + return SubBytePtr{a.ptr_ + n * bitsof / bitsof}; + } + + friend __device__ SubBytePtr operator+(int n, const SubBytePtr a) + { + return a + n; + } + + friend __device__ bool operator==(const SubBytePtr& a, const SubBytePtr& b) + { + return a.ptr_ == b.ptr_; + } + + __device__ explicit operator T*() const + { + return (T*)ptr_; + } + + char* ptr_; +}; + +template +struct get_pointer_type_t % 8 != 0>> { + using type = SubBytePtr; +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/core/sync.h b/src/turbomind/kernels/core/sync.h new file mode 100644 index 0000000000..2f729ce72d --- /dev/null +++ b/src/turbomind/kernels/core/sync.h @@ -0,0 +1,53 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +namespace turbomind { + +__inline__ __device__ int sem_fetch(int* lock, bool pred) +{ + int state{}; + if (pred) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); +#else + asm volatile("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); +#endif + } + return state; +} + +__inline__ __device__ void sem_wait(int* lock, int status, bool pred) +{ + int state = 0; + while (__syncthreads_and(state != status)) { + state = sem_fetch(lock, pred); + } + + __syncthreads(); // memory fence +} + +__inline__ __device__ void sem_wait_many(int* lock, int count, bool pred) +{ + int state = 0; + while (__syncthreads_count(state) != count) { + state = sem_fetch(lock, pred); + } + + __syncthreads(); // memory fence +} + +__inline__ __device__ void sem_post(int* lock, int status, bool pred) +{ + __syncthreads(); // memory fence + + if (pred) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); +#else + asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); +#endif + } +} + +} // namespace turbomind diff --git a/src/turbomind/kernels/attention/thread_map.h b/src/turbomind/kernels/core/thread_map.h similarity index 99% rename from src/turbomind/kernels/attention/thread_map.h rename to src/turbomind/kernels/core/thread_map.h index d7ef2ddc7b..66b691832f 100644 --- a/src/turbomind/kernels/attention/thread_map.h +++ b/src/turbomind/kernels/core/thread_map.h @@ -2,7 +2,7 @@ #pragma once -#include "../gemm_s_f16/common.h" +#include "src/turbomind/kernels/core/common.h" #include diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt new file mode 100644 index 0000000000..6fc634dba0 --- /dev/null +++ b/src/turbomind/kernels/gemm/CMakeLists.txt @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
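The semaphore helpers in core/sync.h above build an inter-block handshake from an acquire load (`sem_fetch`/`sem_wait`) and a release store (`sem_post`), with `.cg` fallbacks before sm_70. A minimal usage sketch, not part of the patch itself (the kernel, `locks` and `partials` are hypothetical; every thread of a block calls the helpers while `pred` selects the single thread that touches the lock):

#include "src/turbomind/kernels/core/sync.h"

__global__ void producer_consumer_sketch(int* locks, float* partials)
{
    const bool leader = threadIdx.x == 0;
    if (blockIdx.x == 0) {
        // ... block 0 writes its partial results to `partials` ...
        turbomind::sem_post(&locks[0], /*status=*/1, leader);  // fence, then release-store 1
    }
    else {
        turbomind::sem_wait(&locks[0], /*status=*/1, leader);  // spin until block 0 posts
        // ... safe to read `partials` written by block 0 here ...
    }
}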
+ +add_library(gemm2 + gemm.cu + kernel.cu + registry.cu + dispatch_cache.cu + gpu_metric.cu + convert_v2.cu + cast.cu + unpack.cu + tuner/cache_utils.cu + tuner/measurer.cu + tuner/sampler.cu + tuner/stopping_criterion.cc + tuner/params.cc + kernel/f16_u4g128_f16_tnt_sm90_s16816.cu + kernel/f16_u4g128_f16_tnt_sm80_s16816.cu + kernel/f16_u4g128_f16_tnt_sm75_s16816.cu + kernel/f16_u4g128_f16_tnt_sm70_s884.cu + kernel/f16_u4g128_f16_tnt_sm75_simt.cu + kernel/u4g128_f16_f16_nnn_sm80_s16816.cu +) + +target_link_libraries(gemm2 PRIVATE parser) + + +target_compile_options(gemm2 PRIVATE + $<$: + -Xptxas=-v + --generate-line-info + --threads 8> +) +set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +if (BUILD_TEST) + add_executable(gemm_test + test/gemm_test.cu + test/test_utils.cu + test/quantization.cu + test/reference.cu) + target_link_libraries(gemm_test PRIVATE gemm2 cublas) + + if (NOT MSVC) + FetchContent_Declare( + repo-nvbench + GIT_REPOSITORY https://github.com/NVIDIA/nvbench.git + GIT_TAG d8dced8a64d9ce305add92fa6d274fd49b569b7e + ) + + set(NVBench_ENABLE_EXAMPLES OFF) + set(BUILD_SHARED_LIBS OFF) + + FetchContent_MakeAvailable(repo-nvbench) + + add_executable(gemm_bench + test/gemm_bench.cu + test/test_utils.cu + test/quantization.cu + test/reference.cu) + target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas) + endif () +endif () diff --git a/src/turbomind/kernels/gemm/arch.h b/src/turbomind/kernels/gemm/arch.h new file mode 100644 index 0000000000..dcb959f2fa --- /dev/null +++ b/src/turbomind/kernels/gemm/arch.h @@ -0,0 +1,49 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +namespace turbomind::gemm { + +// tags for dispatching & conditional codegen + +template +struct Arch { + static constexpr bool is_compatible(int arch) + { + return Begin <= arch && (End == -1 || arch < End); + } +}; + +struct Sm70: Arch<700, 750> { + static constexpr int value = 700; +}; + +struct Sm75: Arch<750, 800> { + static constexpr int value = 750; +}; + +struct Sm80: Arch<800, 900> { + static constexpr int value = 800; +}; + +struct Sm90: Arch<900> { + static constexpr int value = 900; +}; + +inline bool is_arch_compatible(int karch, int darch) +{ + switch (karch) { + case 700: + return Sm70::is_compatible(darch); + case 750: + return Sm75::is_compatible(darch); + case 800: + return Sm80::is_compatible(darch); + case 900: + return Sm90::is_compatible(darch); + default: + return false; + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/config_simt.h b/src/turbomind/kernels/gemm/arch/config_simt.h new file mode 100644 index 0000000000..5652da6c53 --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/config_simt.h @@ -0,0 +1,92 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
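The `Arch` tags defined in arch.h above gate kernels by a half-open SM range (`Sm80` covers sm_80 through sm_89, `Sm90` is open-ended). A hedged host-side sketch of testing a device against a kernel arch; the `major * 100 + minor * 10` encoding is an assumption, not something the patch spells out:

#include "src/turbomind/kernels/gemm/arch.h"
#include <cuda_runtime.h>

bool can_run_sm80_kernel(int device_id)
{
    cudaDeviceProp props{};
    cudaGetDeviceProperties(&props, device_id);
    const int darch = props.major * 100 + props.minor * 10;            // e.g. 860 for sm_86
    return turbomind::gemm::is_arch_compatible(/*karch=*/800, darch);  // true on sm_80..sm_89, false on sm_90
}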
+ +#pragma once + +#include "src/turbomind/kernels/gemm/arch.h" +#include "src/turbomind/kernels/gemm/arch/mma_simt.h" +#include "src/turbomind/kernels/gemm/arch/operand_simt.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/gemm_universal.h" +#include "src/turbomind/kernels/gemm/iterator_sm70.h" +#include "src/turbomind/kernels/gemm/mainloop_sm70.h" +#include "src/turbomind/kernels/gemm/thread_group_map.h" +#include "src/turbomind/kernels/gemm/tiled_mma.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +namespace simt { + +template +struct Sm75_Simt { + + static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); + + static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; + static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; + static constexpr int SMEM_K = A::SmemCopyAtom::K; + + template + struct Type { + + // (TM, TN, TK) = R(MMA_Atom, SmemCopy_Atom) + using MMA_Atom = MMA_SIMT; + + static constexpr int TM = MMA_Atom::M; + static constexpr int TN = MMA_Atom::N; + static constexpr int TK = MMA_Atom::K; + + using Partition = Blocked; + + using MMA_Map = MMA_Map; + using MMA = Tiled_MMA_v2; + + // using MMA_Map = RakedThreadGroupMap; + + using Mainloop = MainloopSm70, + TransformA, + U, + GroupSizeU, + B, + IteratorSm70, + TransformB, + V, + GroupSizeV, + Stages, + true>; + + static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; + static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_; + + using Epilogue = gemm::Epilogue_, + Operand_C, + SplitK>; + + using Kernel = GemmUniversal; + }; +}; + +} // namespace simt + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/config_sm70_s884.h b/src/turbomind/kernels/gemm/arch/config_sm70_s884.h new file mode 100644 index 0000000000..b7b239162f --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/config_sm70_s884.h @@ -0,0 +1,83 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/gemm/arch.h" +#include "src/turbomind/kernels/gemm/arch/mma_sm70.h" +#include "src/turbomind/kernels/gemm/arch/operand_sm70_s884.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/epilogue.h" +#include "src/turbomind/kernels/gemm/gemm_universal.h" +#include "src/turbomind/kernels/gemm/iterator_sm70.h" +#include "src/turbomind/kernels/gemm/mainloop_sm70.h" +#include "src/turbomind/kernels/gemm/thread_group_map.h" +#include "src/turbomind/kernels/gemm/tiled_mma.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm::sm70_s884 { + +template +struct Sm70_s884 { + + static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); + + static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; + static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; + static constexpr int SMEM_K = A::SmemCopyAtom::K; + + template + struct Type { + + // (TM, TN, TK) = R(MMA_Atom, SmemCopy_Atom) + using MMA_Atom = SM70_MMA_884; + + using Partition = Blocked; + using MMA_Map = MMA_Map; + + using MMA = Tiled_MMA_v2; + + using Mainloop = MainloopSm70, + TransformA, + U, + GroupSizeU, + B, + IteratorSm70, + TransformB, + V, + GroupSizeV, + Stages, + true>; // FusePrefetch_ + + static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; + static constexpr int TILE_C_N = TILE_C_N_ == -1 ? 
CTA_N : TILE_C_N_; + + using Epilogue = gemm::Epilogue_, + Operand_C, + SplitK>; + + using Kernel = GemmUniversal; + }; +}; + +} // namespace turbomind::gemm::sm70_s884 diff --git a/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h b/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h new file mode 100644 index 0000000000..0dd643d4e8 --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h @@ -0,0 +1,83 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/arch.h" +#include "src/turbomind/kernels/gemm/arch/mma_sm80.h" +#include "src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/epilogue.h" +#include "src/turbomind/kernels/gemm/gemm_universal.h" +#include "src/turbomind/kernels/gemm/iterator_sm70.h" +#include "src/turbomind/kernels/gemm/mainloop_sm70.h" +#include "src/turbomind/kernels/gemm/thread_group_map.h" +#include "src/turbomind/kernels/gemm/tiled_mma.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +namespace sm75_s16816 { + +using namespace sm80_s16816; + +template +struct Sm75_s16816 { + + static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); + + static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; + static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; + static constexpr int SMEM_K = A::SmemCopyAtom::K; + + template + struct Type { + // Raked partition dont support `Pack_M > 1` + using Partition = Blocked; + using MMA_Map = MMA_Map; + using MMA = Tiled_MMA_v2; + + using Mainloop = MainloopSm70, + TransformA, + U, + GroupSizeU, + B, + IteratorSm70, + TransformB, + V, + GroupSizeV, + Stages, + true>; // FusePrefetch_ + + static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; + static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_; + + using Epilogue = gemm::Epilogue_, + Operand_C, + SplitK>; + + using Kernel = GemmUniversal; + }; +}; + +} // namespace sm75_s16816 + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h b/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h new file mode 100644 index 0000000000..c12f556b01 --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h @@ -0,0 +1,91 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include "src/turbomind/kernels/gemm/arch.h" +#include "src/turbomind/kernels/gemm/arch/mma_sm80.h" +#include "src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/epilogue.h" +#include "src/turbomind/kernels/gemm/gemm_universal.h" +#include "src/turbomind/kernels/gemm/iterator_sm80.h" +#include "src/turbomind/kernels/gemm/mainloop_sm80_v2.h" +#include "src/turbomind/kernels/gemm/thread_group_map.h" +#include "src/turbomind/kernels/gemm/tiled_mma.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm::sm80_s16816 { + +template +struct Sm80_s16816 { + + static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); + + static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; + static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; + static constexpr int SMEM_K = A::SmemCopyAtom::K; + + template + + struct Type { + + // Raked partition dont support `Pack_M > 1` + using Partition = Blocked; + using MMA_Map = MMA_Map; + using MMA = Tiled_MMA_v2; + + using Mainloop = MainloopSm80_v2, + TransformA, + U, + GroupSizeU, + B, + IteratorSm80, + TransformB, + V, + GroupSizeV, + Stages, + FusePrefecth>; + + static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; + static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_; + + using Epilogue = gemm::Epilogue_, + Operand_C, + SplitK>; + + using Kernel = GemmUniversal; + }; +}; + +} // namespace turbomind::gemm::sm80_s16816 diff --git a/src/turbomind/kernels/gemm/arch/mma_simt.h b/src/turbomind/kernels/gemm/arch/mma_simt.h new file mode 100644 index 0000000000..f426ba4af8 --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/mma_simt.h @@ -0,0 +1,71 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
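Of these configs, only the Sm80 path above uses a true multi-stage (`Stages`) cp.async pipeline via `MainloopSm80_v2`/`IteratorSm80`; the `PipeIter` helper added in core/pipe_iter.h earlier in this patch captures the kind of read/write stage bookkeeping such a mainloop needs. A standalone sketch of how it cycles, assuming its stripped template parameters are `<int Stages, int Step = 1>`:

#include "src/turbomind/kernels/core/pipe_iter.h"

__device__ void pipe_iter_sketch()
{
    turbomind::PipeIter<3, 1> it;  // Stages = 3, Step = 1
    // initially: w = 2 (slot being filled), r = 0 (slot being read)
    ++it;                          // w = 0, r = 1
    ++it;                          // w = 1, r = 2
    ++it;                          // w = 2, r = 0 -- wraps around
}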
+ +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/simt.h" + +namespace turbomind::gemm { + +template +struct MMA_SIMT { + static constexpr int M = simt::OP_M; + static constexpr int N = simt::OP_N; + static constexpr int K = simt::OP_K; + + static constexpr int kThreadCount = 32; + + static constexpr auto kOpClass = OpClass::kSIMT; + + using FragA = Array; + using FragB = Array; + using FragC = Array; + + using OffsetC = Array; + using FragC_ = FragC[1]; + + __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) + { + PRAGMA_UNROLL + for (int k = 0; k < K; ++k) { + d[0] = c[0] + float(a[k]) * float(b[k]); + } + + // PRAGMA_UNROLL + // for (int k = 0; k < K; ++k) { + // d[0] = c[0] + float(a[k] * b[k]); + // } + + // T acc{}; + // PRAGMA_UNROLL + // for (int k = 0; k < K; ++k) { + // acc += a[k] * b[k]; + // } + // d[0] = c[0] + float(acc); + } + + __device__ static constexpr OffsetC static_offset_C() + { + return {}; + } + + __device__ static int2 thread_offset_C() // -> (m,n) + { + const int lane_id = threadIdx.x % WARP_SIZE; + return {lane_id / N, lane_id % N}; + } + + __device__ static void ReshapeC(const FragC& c, FragC_& c_) + { + c_[0] = c; + } + + __device__ static int get_group_id(int thread_idx) + { + return thread_idx / WARP_SIZE; + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/mma_sm70.h b/src/turbomind/kernels/gemm/arch/mma_sm70.h new file mode 100644 index 0000000000..f5a1d4f0cf --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/mma_sm70.h @@ -0,0 +1,76 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/mma.h" +#include "src/turbomind/kernels/gemm/desc.h" + +namespace turbomind::gemm { + +struct SM70_MMA_884 { + // static constexpr int M = 16; + // static constexpr int N = 16; + static constexpr int M = 8; + static constexpr int N = 32; + static constexpr int K = 8; + + static constexpr int kThreadCount = 32; + + static constexpr auto kOpClass = OpClass::kMMA_s884; + + using FragA = Array; + using FragB = Array; + using FragC = Array; + + using OffsetC = Array; + using FragC_ = Array[4]; + + __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) + { + mma_m8n8k4_row_col(d, (const Array&)a[0], (const Array&)b[0], (FragC&)c); + if constexpr (K == 8) { + mma_m8n8k4_row_col(d, (const Array&)a[4], (const Array&)b[4], (FragC&)d); + } + } + + __device__ static constexpr OffsetC static_offset_C() + { + OffsetC r{}; + PRAGMA_UNROLL + for (int n = 0; n < 2; ++n) { + PRAGMA_UNROLL + for (int m = 0; m < 2; ++m) { + r[n * 2 + m] = int2{m * 2, n * 4}; + } + } + return r; + } + + __device__ static int2 thread_offset_C() // -> (m,n) + { + const int lane_id = threadIdx.x % WARP_SIZE; + // return { + // (lane_id & 8) * 1 + (lane_id & 1) + lane_id / 16 * 4, + // (lane_id & 4) * 2 + (lane_id & 2), + // }; + return {(lane_id & 1) + (lane_id / 16) * 4, // + (lane_id & 2) + (lane_id & 12) * 2}; + } + + __device__ static void ReshapeC(const FragC& c, FragC_& c_) + { + PRAGMA_UNROLL + for (int m = 0; m < 4; ++m) { + c_[m] = (Array&)c[m * 2]; + } + } + + __device__ static int get_group_id(int thread_idx) + { + return thread_idx / WARP_SIZE; + } +}; + +} // namespace turbomind::gemm diff --git 
a/src/turbomind/kernels/gemm/arch/mma_sm80.h b/src/turbomind/kernels/gemm/arch/mma_sm80.h new file mode 100644 index 0000000000..c78ba0209a --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/mma_sm80.h @@ -0,0 +1,74 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/mma.h" +#include "src/turbomind/kernels/gemm/desc.h" + +namespace turbomind::gemm { + +struct SM80_MMA_16x8x16_F32_F16_F16_F32_TN { + static constexpr int M = 16; + static constexpr int N = 8; + static constexpr int K = 16; + + static constexpr int kThreadCount = 32; + + static constexpr auto kOpClass = OpClass::kMMA_s16816; + + using FragA = Array; + using FragB = Array; + using FragC = Array; + + using OffsetC = Array; // (m, n) + using FragC_ = Array[2]; + + __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) + { + mma_m16n8k16_row_col(d, a, b, (FragC&)c); + } + + __device__ static constexpr OffsetC static_offset_C() + { + return {int2{0, 0}, int2{8, 0}}; + } + + __device__ static int2 thread_offset_C() // -> (m,n) + { + const int lane_id = threadIdx.x % WARP_SIZE; + return {lane_id / 4, lane_id % 4 * 2}; + } + + __device__ static void ReshapeC(const FragC& c, FragC_& c_) + { + PRAGMA_UNROLL + for (int m = 0; m < 2; ++m) { + c_[m] = (Array&)c[m * 2]; + } + } + + __device__ static int get_group_id(int thread_idx) + { + return thread_idx / WARP_SIZE; + } +}; + +// This is not used yet +struct SM75_MMA_16x8x8_F32_F16_F16_F32_TN: SM80_MMA_16x8x16_F32_F16_F16_F32_TN { + static constexpr int M = 16; + static constexpr int N = 8; + static constexpr int K = 8; + + using FragA = Array; + using FragB = Array; + using FragC = Array; + + __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) + { + mma_m16n8k8_row_col(d, a, b, (FragC&)c); + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/operand_simt.h b/src/turbomind/kernels/gemm/arch/operand_simt.h new file mode 100644 index 0000000000..112debd9ee --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/operand_simt.h @@ -0,0 +1,175 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
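A hedged sketch of one `SM80_MMA_16x8x16_F32_F16_F16_F32_TN` issue and of where its accumulator elements land. The fragment element types shown (`Array<half, 8>`, `Array<half, 4>`, `Array<float, 4>`) are assumptions recovered from the m16n8k16 shape, since the template arguments were lost above:

#include "src/turbomind/kernels/core/array.h"
#include "src/turbomind/kernels/gemm/arch/mma_sm80.h"
#include <cuda_fp16.h>

__device__ void mma_atom_sketch(const turbomind::Array<half, 8>& a,   // 16x16 A tile, per lane
                                const turbomind::Array<half, 4>& b,   // 16x8  B tile, per lane
                                turbomind::Array<float, 4>&      c)   // 16x8  accumulator, per lane
{
    using Atom = turbomind::gemm::SM80_MMA_16x8x16_F32_F16_F16_F32_TN;

    Atom::fma(c, a, b, c);  // c += a * b for this warp's 16x8 output tile

    const int2 thr = Atom::thread_offset_C();  // (m, n) of this lane's first element pair
    // c[0], c[1] sit at row thr.x,     columns thr.y and thr.y + 1
    // c[2], c[3] sit at row thr.x + 8, same columns (the {8, 0} static offset above)
}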
+ +#pragma once + +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/arch/smem_copy_simt.h" +#include "src/turbomind/kernels/gemm/iterator.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/simt.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +namespace simt { + +struct GetSmemLayout { + template + static constexpr auto apply(pair) + { + return SmemLayoutV2{}; + } +}; + +template +struct Operand_A { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = kRowMajor; + + using SmemCopyAtom = SmemCopy_MMA_SIMT_A; + + using GetSmemLayout = GetSmemLayout; + using GetGmemIter = GetGmemIter; +}; + +template +struct Operand_B { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = kRowMajor; + + using SmemCopyAtom = SmemCopy_MMA_SIMT_B; + + using GetSmemLayout = GetSmemLayout; + using GetGmemIter = GetGmemIter; +}; + +template +struct _GetSmemLayoutC { + template + static constexpr auto apply(pair) + { + constexpr auto cs = mk2cs(M, N); + return SmemLayoutV2{}; + } +}; + +template +struct _GetThreadMapC { + template + static constexpr auto apply(pair, constant) + { + constexpr auto cs = mk2cs(M, N); + constexpr int WARPS = THREADS / WARP_SIZE; + + return ThreadMap_V2{}; + } +}; + +template +struct Operand_C { + using Dtype = T; + + static constexpr Order kOrder = order; + + using GetSmemLayout = _GetSmemLayoutC; + using GetThreadMap = _GetThreadMapC; +}; + +template +struct Operand_V { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = kColMajor; + + using SmemCopyAtom = SmemCopy_MMA_SIMT_V; + + struct GetSmemLayout { // m-major + template + static constexpr auto apply(pair) + { + return SmemLayoutV2{}; + } + }; + + using GetGmemIter = GetGmemIter; +}; + +struct GetSmemLayout_Pack { + template + static constexpr auto apply(pair) + { + return SmemLayoutV2{}; + } +}; + +template +struct Operand_B_Pack { + using Dtype = T; + + static constexpr int Pack_M = 1; + + static constexpr Pack kPack = HMMA_SIMT | OPERAND_B | Pack_M; + static constexpr Order kOrder = kRowMajor; + + using SmemCopyAtom = SmemCopyAtom_Pack_v3::SmemCopyAtom, kRowMajor, Pack_M>; + using GetSmemLayout = GetSmemLayout_Pack; + using GetGmemIter = GetGmemIter; +}; + +template +struct Operand_V_Pack { + using Dtype = T; + + static constexpr int Pack_M = 1; + + static constexpr Pack kPack = HMMA_SIMT | OPERAND_V | Pack_M; + static constexpr Order kOrder = kColMajor; + + using SmemCopyAtom = SmemCopyAtom_Pack_v3, kColMajor, Pack_M>; + + struct GetSmemLayout { // m-major + template + static constexpr auto apply(pair) + { + return SmemLayoutV2{}; + } + }; + + using GetGmemIter = GetGmemIter; +}; + +} // namespace simt + +template +struct GetOperand: std::true_type { + using Operand = simt::Operand_A; +}; + +template +struct GetOperand: std::true_type { + using Operand = simt::Operand_B; +}; + +template +struct GetOperand: std::true_type { + using Operand = simt::Operand_V; +}; + +template +struct GetOperand: std::true_type { + using Operand = simt::Operand_B_Pack; +}; + +template +struct GetOperand: std::true_type { + using Operand = simt::Operand_V_Pack; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/operand_sm70_s884.h b/src/turbomind/kernels/gemm/arch/operand_sm70_s884.h new file mode 100644 
index 0000000000..20203b9537 --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/operand_sm70_s884.h @@ -0,0 +1,169 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/arch/smem_copy_sm70.h" +#include "src/turbomind/kernels/gemm/iterator.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +namespace sm70_s884 { + +template +struct GetSmemLayout { + template + static constexpr auto apply(pair) + { + constexpr int2 cs = mk2cs(M, K); + return SmemLayoutV2{}; + } +}; + +template +struct Operand_A { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = kRowMajor; + + using SmemCopyAtom = SmemCopy_MMA_884_A; + + using GetSmemLayout = GetSmemLayout; + using GetGmemIter = GetGmemIter; +}; + +template +struct Operand_B { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = kRowMajor; // (n,k) + + using SmemCopyAtom = SmemCopy_MMA_884_B; + + using GetSmemLayout = GetSmemLayout; + using GetGmemIter = GetGmemIter; +}; + +template +struct Operand_V { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = kColMajor; // (n,k) + + using SmemCopyAtom = SmemCopy_MMA_884_V; + + struct GetSmemLayout { // m-major + template + static constexpr auto apply(pair) + { + return SmemLayoutV2{}; + } + }; + + using GetGmemIter = GetGmemIter; +}; + +template +struct _GetSmemLayoutC { + template + static constexpr auto apply(pair) + { + constexpr auto cs = mk2cs(M, N); + return SmemLayoutV2{}; + } +}; + +template +struct _GetThreadMapC { + template + static constexpr auto apply(pair, constant) + { + constexpr auto cs = mk2cs(M, N); + constexpr int WARPS = THREADS / WARP_SIZE; + + return ThreadMap_V2{}; + } +}; + +template +struct Operand_C { + using Dtype = T; + + static constexpr Order kOrder = order; + + using GetSmemLayout = _GetSmemLayoutC; + using GetThreadMap = _GetThreadMapC; +}; + +template +struct Operand_B_Pack { + using Dtype = T; + + static constexpr int Pack_M = 1; + + static constexpr Pack kPack = HMMA_884 | OPERAND_B | Pack_M; + static constexpr Order kOrder = kRowMajor; + + using SmemCopyAtom = SmemCopyAtom_Pack_v3, kOrder, Pack_M>; + + using GetSmemLayout = GetSmemLayout; + using GetGmemIter = GetGmemIter; +}; + +template +struct Operand_V_Pack { + using Dtype = T; + + static constexpr int Pack_M = 1; + + static constexpr Pack kPack = HMMA_884 | OPERAND_V | Pack_M; + static constexpr Order kOrder = kColMajor; + + using SmemCopyAtom = SmemCopyAtom_Pack_v3, kColMajor, Pack_M>; + + struct GetSmemLayout { // m-major + template + static constexpr auto apply(pair) + { + return SmemLayoutV2{}; + } + }; + + using GetGmemIter = GetGmemIter; +}; + +} // namespace sm70_s884 + +template +struct GetOperand: std::true_type { + using Operand = sm70_s884::Operand_A; +}; + +template +struct GetOperand: std::true_type { + using Operand = sm70_s884::Operand_B; +}; + +template +struct GetOperand: std::true_type { + using Operand = sm70_s884::Operand_V; +}; + +template +struct GetOperand: std::true_type { + using Operand = sm70_s884::Operand_B_Pack; +}; + +template +struct GetOperand: std::true_type { + using Operand = sm70_s884::Operand_V_Pack; +}; + +} // namespace turbomind::gemm diff --git 
a/src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h b/src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h new file mode 100644 index 0000000000..7b36aa795a --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h @@ -0,0 +1,249 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/arch/smem_copy_sm80.h" +#include "src/turbomind/kernels/gemm/iterator.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/types.h" +#include + +namespace turbomind::gemm { + +namespace sm80_s16816 { + +namespace detail { + +struct GetSmemLayout { + template + static constexpr auto apply(pair) + { + // constexpr int S0 = S >= 16 ? 16 : 8; + constexpr int S0 = 8; + constexpr int C0 = C >= 64 ? 64 : (C >= 32 ? 32 : 16); + using _Small = std::conditional_t, Swizzle<1, 3, 3>>; + using Swizzle = std::conditional_t, _Small>; + return SmemLayoutV2{}; + } +}; + +} // namespace detail + +template +struct GetSmemLayoutV2 { + template + static constexpr auto apply(pair) + { + constexpr int2 cs = mk2cs(M, K); + return detail::GetSmemLayout::apply(pair{}); + } +}; + +// (m, k) +template +struct Operand_A { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = order; + + // using SmemCopyAtom = + // std::conditional_t, SmemCopy_MMA_16816_B>; + + // using SmemCopyAtom = std::conditional_t, + // LDSM_SM75_8x8>; + + using SmemCopyAtom = LDSM_SM75_8x8; + + using GetSmemLayout = GetSmemLayoutV2; + using GetGmemIter = GetGmemIter; +}; + +// (n, k) +template +struct Operand_B { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = order; + + // using SmemCopyAtom = + // std::conditional_t, SmemCopy_MMA_16816_A>; + // using SmemCopyAtom = std::conditional_t, + // LDSM_SM75_8x8>; + + using SmemCopyAtom = LDSM_SM75_8x8; + + using GetSmemLayout = GetSmemLayoutV2; + using GetGmemIter = GetGmemIter; +}; + +template +struct _GetSmemLayoutC { + template + static constexpr auto apply(pair) + { + if constexpr (order == kRowMajor) { + // x01 23 + // cccccss + // bits base shift + return SmemLayoutV2>{}; + } + else { + // 234 x01 + // 23401x + // cccccsss + // so that x is not part of swizzling + return SmemLayoutV2>{}; + } + } +}; + +template +struct _GetThreadMapC { + template + static constexpr auto apply(pair, constant) + { + constexpr auto cs = mk2cs(M, N); + constexpr int WARPS = THREADS / WARP_SIZE; + + return ThreadMap_V2{}; + } +}; + +template +struct Operand_C { + using Dtype = T; + + static constexpr Order kOrder = order; + + using GetSmemLayout = _GetSmemLayoutC; + using GetThreadMap = _GetThreadMapC; +}; + +template +struct Operand_UV { + using Dtype = T; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = kColMajor; + + using SmemCopyAtom = SmemCopy_MMA_16816_U; + + struct GetSmemLayout { + template + static constexpr auto apply(pair) + { + return SmemLayoutV2{}; + } + }; + using GetGmemIter = GetGmemIter; +}; + +template +struct GetSmemLayout_Pack { + template + static constexpr auto apply(pair) + { + constexpr int2 CS = mk2cs(M, K); + return SmemLayoutV2{}; + } +}; + +template +struct Operand_A_Pack { + using Dtype = T; + + static constexpr int Pack_M = 2; + + static constexpr Pack kPack = HMMA_16816 | OPERAND_A | Pack_M; + static 
constexpr Order kOrder = order; + + // using SmemCopyAtom = SmemCopyAtom_Pack_v2; + using _SCp = typename Operand_A::SmemCopyAtom; + using SmemCopyAtom = SmemCopyAtom_Pack_v3; + + using GetSmemLayout = GetSmemLayout_Pack; + using GetGmemIter = GetGmemIter; +}; + +template +struct Operand_B_Pack { + using Dtype = T; + + static constexpr int Pack_M = 2; + + static constexpr Pack kPack = HMMA_16816 | OPERAND_B | Pack_M; + static constexpr Order kOrder = order; + + using SmemCopyAtom = SmemCopyAtom_Pack_v2; + + using GetSmemLayout = GetSmemLayout_Pack; + using GetGmemIter = GetGmemIter; +}; + +template +struct Operand_UV_Pack { + using Dtype = T; + + static constexpr int Pack_M = 1; + + static constexpr Pack kPack = HMMA_16816 | (is_V ? OPERAND_V : OPERAND_U) | Pack_M; + static constexpr Order kOrder = Order::kColMajor; + + using _SCp = typename Operand_UV::SmemCopyAtom; + using SmemCopyAtom = SmemCopyAtom_Pack_v3; + + using GetSmemLayout = GetSmemLayout_Pack; + using GetGmemIter = GetGmemIter; +}; + +} // namespace sm80_s16816 + +template +struct GetOperand: std::true_type { + using Operand = sm80_s16816::Operand_A; +}; + +template +struct GetOperand: std::true_type { + using Operand = sm80_s16816::Operand_B; +}; + +template +struct GetOperand: std::true_type { + using Operand = sm80_s16816::Operand_UV; +}; + +template +struct GetOperand: std::true_type { + using Operand = sm80_s16816::Operand_UV; +}; + +// template +// struct GetOperand: std::true_type { +// using Operand = sm80_s16816::Operand_A_Pack; +// }; + +// template +// struct GetOperand: std::true_type { +// using Operand = sm80_s16816::Operand_B_Pack; +// }; + +// template<> +// struct GetOperand: std::true_type { +// using Operand = sm80_s16816::Operand_U_Pack; +// }; + +// template<> +// struct GetOperand: std::true_type { +// using Operand = sm80_s16816::Operand_U_Pack; +// }; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/smem_copy_simt.h b/src/turbomind/kernels/gemm/arch/smem_copy_simt.h new file mode 100644 index 0000000000..ecf20a17b7 --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/smem_copy_simt.h @@ -0,0 +1,102 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/gemm/simt.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +template +struct SmemCopy_MMA_SIMT_A { + static constexpr int M = simt::OP_M; + static constexpr int K = simt::OP_K; + + static constexpr int OP_N = simt::OP_N; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + return {lane_id / OP_N, 0}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) // -> (m, k) + { + Lds(*(Frag*)dst_ptr, (S &&) src_ptr); + } + + __device__ static int2 unique(int thread_idx, int pack_idx) // -> (unique id, repeat id) + { + const int lane_id = thread_idx % WARP_SIZE; + return {pack_idx * M + lane_id / OP_N, lane_id % OP_N}; + } +}; + +template +struct SmemCopy_MMA_SIMT_B { + static constexpr int M = simt::OP_N; + static constexpr int K = simt::OP_K; + + static constexpr int OP_N = simt::OP_N; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) // -> (m, k) + { + const int lane_id = thread_idx % WARP_SIZE; + return {lane_id % OP_N, 0}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) + { + Lds(*(Frag*)dst_ptr, (S &&) src_ptr); + } + + __device__ static int2 unique(int thread_idx, int pack_idx) // -> (unique id, repeat id) + { + const int lane_id = thread_idx % WARP_SIZE; + return {pack_idx * OP_N + lane_id % OP_N, lane_id / OP_N}; + } +}; + +template +struct SmemCopy_MMA_SIMT_V { + static constexpr int M = simt::OP_N; + static constexpr int K = K_; + + static constexpr int OP_N = simt::OP_N; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + return {pack_idx * OP_N + lane_id % OP_N, lane_id / OP_N}; + } + + __device__ static int2 get_offset(int thread_idx) // -> (m, k) + { + return {unique(thread_idx, 0).x, 0}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool mask) + { + Lds(*(Frag*)dst_ptr, src_ptr); + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/smem_copy_sm70.h b/src/turbomind/kernels/gemm/arch/smem_copy_sm70.h new file mode 100644 index 0000000000..9ed25b45b4 --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/smem_copy_sm70.h @@ -0,0 +1,113 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" + +namespace turbomind::gemm { + +template +struct SmemCopy_MMA_884_A { + // static constexpr int M = 16; + // static constexpr int K = 8; + static constexpr int M = 8; + static constexpr int K = 8; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + // 4 3 01 + // const int m = lane_id / 16 * 4 + (lane_id & 8) + lane_id % 4; + // return {pack_idx * M + m, (lane_id & 4) >> 2}; + + // 4 01 + const int m = lane_id / 16 * 4 + lane_id % 4; + return {pack_idx * M + m, (lane_id & 12) >> 2}; + } + + __device__ static int2 get_offset(int thread_idx) + { + return int2{unique(thread_idx, 0).x, 0}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) + { + Lds(*(Frag*)dst_ptr, src_ptr); + } +}; + +template +struct SmemCopy_MMA_884_B { + // static constexpr int M = 16; + // static constexpr int K = 8; + static constexpr int M = 32; + static constexpr int K = 8; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + // 4 2 01 + // const int m = lane_id / 16 * 4 + (lane_id & 4) * 2 + lane_id % 4; + // return {pack_idx * M + m, (lane_id & 8) >> 3}; + + // 4 23 01 + const int m = lane_id / 16 * 4 + (lane_id & 12) * 2 + lane_id % 4; + return {pack_idx * M + m, 0}; + } + + __device__ static int2 get_offset(int thread_idx) + { + return int2{unique(thread_idx, 0).x, 0}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) + { + Lds(*(Frag*)dst_ptr, src_ptr); + } +}; + +template +struct SmemCopy_MMA_884_V { + // static constexpr int M = 16; + static constexpr int M = 32; + static constexpr int K = K_; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + // 4 2 01 + // const int m = lane_id / 16 * 4 + (lane_id & 4) * 2 + lane_id % 4; + // return {pack_idx * 16 + m, (lane_id & 8) >> 3}; + + const int m = lane_id / 16 * 4 + (lane_id & 12) * 2 + lane_id % 4; + return {pack_idx * M + m, 0}; + } + + __device__ static int2 get_offset(int thread_idx) + { + return int2{unique(thread_idx, 0).x, 0}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) + { + Lds(*(Frag*)dst_ptr, src_ptr); + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/arch/smem_copy_sm80.h b/src/turbomind/kernels/gemm/arch/smem_copy_sm80.h new file mode 100644 index 0000000000..e3dd1f0bac --- /dev/null +++ b/src/turbomind/kernels/gemm/arch/smem_copy_sm80.h @@ -0,0 +1,207 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
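The SM75/SM80 copy atoms that follow are thin wrappers over the `ldmatrix` helpers from core/smem.h added earlier in this patch. A standalone sketch of the underlying primitive (the `Array<uint32_t, 4>` fragment type is an assumption; every lane of the warp participates, each supplying the shared-memory address of one 8-element row):

#include "src/turbomind/kernels/core/array.h"
#include "src/turbomind/kernels/core/smem.h"
#include <cuda_fp16.h>

// Loads a row-major 32x8 half tile from shared memory as four stacked 8x8
// matrices; lane i supplies the address of row i.
__device__ void ldsm_sketch(const half* smem_tile /* points into __shared__ memory */,
                            turbomind::Array<uint32_t, 4>& frag)
{
    const int      lane = threadIdx.x % 32;
    const uint32_t ptr  = turbomind::cast_smem_ptr_to_uint(smem_tile + lane * 8);
    turbomind::ldsm_x4(frag, ptr);  // non-transposed x4 ldmatrix
}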
+ +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/smem.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +template +struct LDSM_x4 { + template + __device__ static void apply(S src_ptr, D dst_ptr) + { + const uint32_t uint_ptr = cast_smem_ptr_to_uint(src_ptr); + if constexpr (trans) { + ldsm_x4_trans(*(Array*)dst_ptr, uint_ptr); + } + else { + ldsm_x4(*(Array*)dst_ptr, uint_ptr); + } + } +}; + +template +struct LDSM_x2 { + template + __device__ static void apply(S src_ptr, D dst_ptr) + { + const uint32_t uint_ptr = cast_smem_ptr_to_uint(src_ptr); + if constexpr (trans) { + ldsm_x2_trans(*(Array*)dst_ptr, uint_ptr); + } + else { + ldsm_x2(*(Array*)dst_ptr, uint_ptr); + } + } +}; + +template +struct LDSM_x1 { + template + __device__ static void apply(S src_ptr, D dst_ptr) + { + const uint32_t uint_ptr = cast_smem_ptr_to_uint(src_ptr); + if constexpr (trans) { + ldsm_x1_trans(*(Array*)dst_ptr, uint_ptr); + } + else { + ldsm_x1(*(Array*)dst_ptr, uint_ptr); + } + } +}; + +template +struct SmemCopy_MMA_16816_A { + static constexpr int M = 16; + static constexpr int K = 16; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) // -> (m, k) + { + const int lane_id = thread_idx % WARP_SIZE; + + const int c = lane_id / 16 * 8; + const int s = lane_id % 16; + + return trans ? int2{c, s} : int2{s, c}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) + { + LDSM_x4::apply((S &&) src_ptr, (D &&) dst_ptr); + } + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + return {pack_idx * WARP_SIZE + thread_idx % WARP_SIZE, 0}; + } +}; + +template +struct SmemCopy_MMA_16816_B { + static constexpr int M = 16; + static constexpr int K = 16; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + + const int c = lane_id / 8 * 8 % 16; + const int s = lane_id % 8 + lane_id / 16 * 8; + + return trans ? 
int2{c, s} : int2{s, c}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) + { + LDSM_x4::apply((S &&) src_ptr, (D &&) dst_ptr); + } + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + return {pack_idx * WARP_SIZE + thread_idx % WARP_SIZE, 0}; + } +}; + +template +struct LDSM_SM75_8x8 { + static constexpr int M = M_; + static constexpr int K = K_; + + static constexpr int iM = M / 8; + static constexpr int iK = K / 8; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + int c, s; + if constexpr (mat_order == kColMajor) { + s = lane_id % 16; + c = lane_id / 16 * 8; + } + else { + s = lane_id / 16 * 8 + lane_id % 8; + c = lane_id & 8; + } + int2 mk = cs2mk(c, s); +#if __CUDA_ARCH__ <= 750 // wrap ptrs around for sm_75 + mk.x %= M; + mk.y %= K; +#endif + return mk; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) + { + constexpr bool trans = thr_order != kRowMajor; + if constexpr (sizeof(Frag) == 16) { + LDSM_x4::apply((S &&) src_ptr, (D &&) dst_ptr); + } + else if constexpr (sizeof(Frag) == 8) { + LDSM_x2::apply((S &&) src_ptr, (D &&) dst_ptr); + } + else if constexpr (sizeof(Frag) == 4) { + LDSM_x1::apply((S &&) src_ptr, (D &&) dst_ptr); + } + else { + static_assert(sizeof(S) != sizeof(S), "not implemented"); + } + } + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + return {pack_idx * WARP_SIZE + thread_idx % WARP_SIZE, 0}; + } +}; + +template +struct SmemCopy_MMA_16816_U { // (M, K) + static constexpr int M = 16; + static constexpr int K = 1; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + // Note: this forbids sub-tile group sizes + return {lane_id / 4, 0}; + } + + template + __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool mask) + { + PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + Lds(*((Array*)dst_ptr + i), src_ptr + i * 8); + } + } + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + const int lane_id = thread_idx % WARP_SIZE; + return {pack_idx * 8 + lane_id / 4, lane_id % 4}; + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/cast.cu b/src/turbomind/kernels/gemm/cast.cu new file mode 100644 index 0000000000..730b4bb3bf --- /dev/null +++ b/src/turbomind/kernels/gemm/cast.cu @@ -0,0 +1,196 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
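The `Cast` specializations that follow pack and unpack 4-bit values eight at a time. A small worked example of the packing direction (standalone, not from the patch): iterating j from 7 down to 0 with `v = (v << 4) | x[j]` leaves element 0 in the lowest nibble, so unpacking can walk the nibbles from low to high.

#include <cstdint>

constexpr uint32_t pack8_u4(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                            uint32_t x4, uint32_t x5, uint32_t x6, uint32_t x7)
{
    const uint32_t x[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
    uint32_t       v    = 0;
    for (int j = 7; j >= 0; --j) {
        v = (v << 4) | x[j];  // same loop body as the 4-bit Cast below
    }
    return v;
}

static_assert(pack8_u4(1, 2, 3, 4, 5, 6, 7, 8) == 0x87654321, "element 0 ends up in the lowest nibble");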
+ +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/math.h" + +namespace turbomind { + +template +struct Cast { + template + __device__ static Array apply(const Array& vi) + { + Array vo; + PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + vo[i] = static_cast(vi[i]); + } + return vo; + } +}; + +template +struct Cast { + template + __device__ static Array apply(const Array& vi) + { + static_assert(N % 8 == 0); + Array vo; + PRAGMA_UNROLL + for (int i = 0; i < N; i += 8) { + uint32_t& v = (uint32_t&)vo[i]; + v = 0; + PRAGMA_UNROLL + for (int j = 7; j >= 0; --j) { + v = (v << 4) | vi[i + j]; + } + } + return vo; + } +}; + +template +struct Cast { + template + __device__ static Array apply(const Array& vi) + { + static_assert(N % 8 == 0); + Array vo; + PRAGMA_UNROLL + for (int i = 0; i < N; i += 8) { + uint32_t v = (const uint32_t&)vi[i]; + PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + vo[i + j] = (v & 0xf); + v >>= 4; + } + } + return vo; + } +}; + +template<> +struct Cast { + template + __device__ static Array apply(const Array& vi) + { + return vi; + } +}; + +template +__global__ void cast_kernel(To* dst, const Ti* src, size_t n) +{ + n /= VecSize; + + auto p_src = (const Array*)src; + auto p_dst = (Array*)dst; + + for (size_t p = threadIdx.x + blockDim.x * blockIdx.x; p < n; p += blockDim.x * gridDim.x) { + Array vi; + Ldg(vi, (const Ti*)&p_src[p]); + + Array vo = Cast::apply(vi); + + Store((To*)&p_dst[p], vo); + } +} + +template +void invokeCast(To* dst, const Ti* src, size_t n, cudaStream_t st) +{ + cast_kernel<<<256, 256, 0, st>>>(dst, src, n); +} + +void extend_to_u8(uint8_t* dst, const uint4_t* src, size_t n, cudaStream_t st) +{ + invokeCast<8>(dst, src, n, st); +} + +void compact_to_u4(uint4_t* dst, const uint8_t* src, size_t n, cudaStream_t st) +{ + invokeCast<8>(dst, src, n, st); +} + +void extend_to_u16(uint16_t* dst, const uint4_t* src, size_t n, cudaStream_t st) +{ + invokeCast<8>(dst, src, n, st); +} + +template +__global__ void fuse_scales_and_zeros_kernel(T* fused, const T* scales, T* zeros, size_t n) +{ + n /= VecSize; + + auto p_scales = (const Array*)scales; + auto p_zeros = (const Array*)zeros; + + auto p_fused = (Array*)fused; + + for (size_t p = threadIdx.x + blockDim.x * blockIdx.x; p < n; p += blockDim.x * gridDim.x) { + Array vs; + Ldg(vs, (const T*)&p_scales[p]); + Array vz{}; + if (zeros) { + Ldg(vz, (const T*)&p_zeros[p]); + } + Array vf; + PRAGMA_UNROLL + for (int i = 0; i < VecSize; ++i) { + vf[i * 2] = vs[i]; + vf[i * 2 + 1] = -vz[i] * vs[i]; + } + Store((T*)&p_fused[p], vf); + } +} + +void fuse_scales_and_zeros(half* fused, const half* scales, half* zeros, size_t n, cudaStream_t st) +{ + fuse_scales_and_zeros_kernel<4><<<256, 256, 0, st>>>(fused, scales, zeros, n); +} + +template +__global__ void +interleave_output_dims_kernel(T* __restrict__ fused, const T* __restrict__ a, const T* __restrict__ b, int m, int k) +{ + using Vec1 = Array; + + const int ki = blockIdx.y; + + auto p_a = reinterpret_cast(a + ki * m); + auto p_b = reinterpret_cast(b + ki * m); + + using Vec2 = Array; + + auto p_f = reinterpret_cast(fused + ki * m * 2); + + m /= VecSize; + + const int tidx = threadIdx.x + blockIdx.x * blockDim.x; + + for (int64_t mi = tidx; mi < m; mi += blockDim.x * gridDim.x) { + Vec1 va; + Vec1 vb; + Ldg(va, (const T*)&p_a[mi]); + Ldg(vb, (const T*)&p_b[mi]); + Vec2 vc; + PRAGMA_UNROLL + for (int i = 0; i < VecSize; ++i) { + vc[i 
* 2] = va[i]; + vc[i * 2 + 1] = vb[i]; + } + Store((T*)&p_f[mi], vc); + } +} + +template +void interleave_output_dims_impl(T* fused, const T* a, const T* b, int m, int k, cudaStream_t st) +{ + constexpr int kVecSize = std::min(8, 128 / (bitsof * 2)); + + constexpr int block = 256; + const dim3 grid(1, k); // x is a grid stride loop + + interleave_output_dims_kernel<<>>(fused, a, b, m, k); +} + +template void +interleave_output_dims_impl(uint8_t* fused, const uint8_t* a, const uint8_t* b, int m, int k, cudaStream_t st); +template void +interleave_output_dims_impl(uint16_t* fused, const uint16_t* a, const uint16_t* b, int m, int k, cudaStream_t st); +template void +interleave_output_dims_impl(uint32_t* fused, const uint32_t* a, const uint32_t* b, int m, int k, cudaStream_t st); + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/cast.h b/src/turbomind/kernels/gemm/cast.h new file mode 100644 index 0000000000..d7f039ce7d --- /dev/null +++ b/src/turbomind/kernels/gemm/cast.h @@ -0,0 +1,39 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/core/data_type.h" +#include + +namespace turbomind { + +void extend_to_u8(uint8_t* dst, const uint4_t* src, size_t n, cudaStream_t st = {}); + +void extend_to_u16(uint16_t* dst, const uint4_t* src, size_t n, cudaStream_t st = {}); + +void compact_to_u4(uint4_t* dst, const uint8_t* src, size_t n, cudaStream_t st = {}); + +void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st = {}); + +void fuse_scales_and_zeros(half* fused, const half* scales, half* zeros, size_t n, cudaStream_t st = {}); + +template +void interleave_output_dims_impl(T* fused, const T* a, const T* b, int m, int k, cudaStream_t st); + +template +inline void interleave_output_dims(T* fused, const T* a, const T* b, int m, int k, cudaStream_t st) +{ + auto dispatch = [&](auto u) { + using U = decltype(u); + return interleave_output_dims_impl((U*)fused, (const U*)a, (const U*)b, m, k, st); + }; + if constexpr (bitsof == 8) { + return dispatch(uint8_t{}); + } + else if constexpr (bitsof == 16) { + return dispatch(uint16_t{}); + } + else if constexpr (bitsof == 32) { + return dispatch(uint32_t{}); + } +} + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu new file mode 100644 index 0000000000..7c26a10036 --- /dev/null +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -0,0 +1,241 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
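A short note on `fuse_scales_and_zeros` above: it stores `(s, -z * s)` pairs so that dequantization becomes a single fma per element, because `(q - z) * s == q * s + (-z * s)`. A hedged illustration; the function name and setting are hypothetical:

#include <cmath>

__host__ __device__ inline float dequant_fused_sketch(int q, float scale, float neg_zero_times_scale)
{
    // equals (q - zero) * scale when neg_zero_times_scale == -zero * scale
    return fmaf(static_cast<float>(q), scale, neg_zero_times_scale);
}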
+ +#include "src/turbomind/kernels/attention/quantization.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/arch/operand_simt.h" +#include "src/turbomind/kernels/gemm/arch/operand_sm70_s884.h" +#include "src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/convert_v2.h" +#include "src/turbomind/kernels/gemm/format.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +namespace { + +constexpr bool is_AB(Op_Tag op) +{ + if (op == OPERAND_A || op == OPERAND_B) { + return true; + } + else { + return false; + } +} + +constexpr bool is_UV(Op_Tag op) +{ + return !is_AB(op); +} + +template +constexpr int unit_size(basic_type) +{ + return 1; +} + +constexpr int unit_size(basic_type) +{ + return 4; +} + +constexpr int unit_size(basic_type) +{ + return 8; +} + +} // namespace + +// MMA : H_16816, H_1688, H_884, H_SIMT +// Operand : A, B, U, V +// Order : row, col +// Dtype : u16, u8, u4 (u6, u3) +// PackNum : 1, 2, 4 + +template +struct Config { + static constexpr int CTA_M = 64; + static constexpr int CTA_K = 32; + + static constexpr int BLOCK_SIZE = 32; + + using Stype = typename Operand::Dtype; + using Dtype = Dtype_; + + using Kernel = ConvertOperand>; +}; + +template +void Convert_v2_Impl(const void* S, const MatrixLayout& Sdesc, void* D, const MatrixLayout& Ddesc, cudaStream_t stream) +{ + using Kernel = typename Config::Kernel; + using Stype = typename Config::Stype; + using Dtype = typename Config::Dtype; + + constexpr int CTA_M = Config::CTA_M; + + static constexpr int kSmemSize = sizeof(typename Kernel::SharedStorage); + + if (kSmemSize > (48 << 10)) { + cudaFuncSetAttribute(convert_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize); + } + + using PointerD = typename Kernel::PtrD; + typename Kernel::Param param{Sdesc.rows, Sdesc.cols, (const Stype*)S, Sdesc.ld, PointerD{(Dtype*)D}, Ddesc.ld}; + + constexpr int threads = Config::BLOCK_SIZE; + const int blocks = ceil_div(Sdesc.rows, CTA_M); + + // std::cout << __PRETTY_FUNCTION__ << std::endl; + // std::cout << __PRETTY_FUNCTION__ << "\nThreadMap:\n"; + // Print(typename Kernel::GmemIter::ThreadMap{}); + + convert_kernel<<>>(param); +} + +int Convert(const void* S, // + const MatrixLayout& _Sdesc, + void* D, + const MatrixLayout& _Ddesc, + cudaStream_t stream) +{ + const Op_Tag op_tag = get_operand_tag(_Ddesc.pack); + const bool trans = op_tag == OPERAND_B || op_tag == OPERAND_V; + + // (k, n) -> (n, k) + MatrixLayout Sdesc = trans ? transpose(_Sdesc) : _Sdesc; + MatrixLayout Ddesc = trans ? transpose(_Ddesc) : _Ddesc; + + auto invoke = [&](auto mma, auto operand, auto order, auto stype, auto dtype, auto pack_num) -> bool { + using Stype = typename decltype(stype)::type; + using Dtype = typename decltype(dtype)::type; + + if constexpr (GetOperand::value) { // is operand exist? 
+ + // Make args constexpr explicitly, some compilers failed to see const-ness of the args + constexpr int pack_num_tag = pack_num; + + using Operand = typename GetOperand::Operand; + + static constexpr int kPackSize = Operand::SmemCopyAtom::Frag::size() * pack_num_tag; + static constexpr bool kIsValid = kPackSize % unit_size(type_c) == 0; + + if constexpr (kIsValid) { + Convert_v2_Impl>(S, Sdesc, D, Ddesc, stream); + return true; + } + + // std::cerr << __PRETTY_FUNCTION__ << "\n"; + // std::cerr << kPackSize << " " << unit_size(type_c) << "\n"; + } + + return false; + }; + + auto dispatch_4 = [&](auto mma, auto operand, auto order, auto stype, auto dtype) -> bool { + switch (get_pack_num(Ddesc.pack)) { + case 1: + return invoke(mma, operand, order, stype, dtype, constant<1>{}); + case 2: + return invoke(mma, operand, order, stype, dtype, constant<2>{}); + case 4: + return invoke(mma, operand, order, stype, dtype, constant<4>{}); + default: + return false; + } + }; + + auto dispatch_3 = [&](auto mma, auto operand, auto order) -> bool { + if constexpr (is_AB(operand)) { + switch (Ddesc.type) { + case DataType::F16: + return dispatch_4(mma, operand, order, type_c, type_c); + case DataType::U8: + return dispatch_4(mma, operand, order, type_c, type_c); + case DataType::U4: + return dispatch_4(mma, operand, order, type_c, type_c); + default: + return false; + } + } + else { // UV: U16, U32 + switch (Ddesc.type) { + case DataType::U32: + return dispatch_4(mma, operand, order, type_c, type_c); + default: + return false; + } + } + + return false; + }; + + auto dispatch_2 = [&](auto mma, auto operand) -> bool { + switch (Ddesc.order) { + case Order::kRowMajor: + return dispatch_3(mma, operand, constant{}); + case Order::kColMajor: + return dispatch_3(mma, operand, constant{}); + } + return false; + }; + + auto dispatch_1 = [&](auto mma) -> bool { + /// TODO: add U, V + switch (get_operand_tag(Ddesc.pack)) { + case OPERAND_A: + return dispatch_2(mma, constant{}); + case OPERAND_B: + return dispatch_2(mma, constant{}); + case OPERAND_U: + return dispatch_2(mma, constant{}); + case OPERAND_V: + return dispatch_2(mma, constant{}); + default: + return false; + } + }; + + auto dispatch = [&]() -> bool { + /// TODO: add HMMA_1688, HMMA_884, HMMA_SIMT + switch (get_mma_tag(Ddesc.pack)) { + case HMMA_16816: + return dispatch_1(constant{}); + case HMMA_SIMT: + return dispatch_1(constant{}); + case HMMA_884: + return dispatch_1(constant{}); + default: + return false; + } + }; + + // -1 on failure + return dispatch() - 1; +} + +std::tuple get_weight_and_scales_layout(int sm, bool force_simt) +{ + if (force_simt) { + return {kColMajor, HMMA_SIMT | OPERAND_B | 1, kRowMajor, HMMA_SIMT | OPERAND_V | 1}; + } + if (sm >= 80) { + return {kRowMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; + } + else if (sm == 75) { + return {kRowMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; + } + else if (sm == 70) { + return {kColMajor, HMMA_884 | OPERAND_B | 1, kRowMajor, HMMA_884 | OPERAND_V | 1}; + } + else { + std::cerr << "not implemented: sm_" << sm << std::endl; + std::abort(); + } + return {}; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/convert_v2.h b/src/turbomind/kernels/gemm/convert_v2.h new file mode 100644 index 0000000000..7819517fbd --- /dev/null +++ b/src/turbomind/kernels/gemm/convert_v2.h @@ -0,0 +1,201 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
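A hedged usage sketch of `get_weight_and_scales_layout` above. The tuple element order (weight order, weight pack, scales order, scales pack) is inferred from its return statements, and both the include location and the surrounding setup are assumptions:

#include "src/turbomind/kernels/gemm/gemm.h"  // assumed to declare get_weight_and_scales_layout

void pick_layouts_sketch(int sm)
{
    auto [w_order, w_pack, q_order, q_pack] =
        turbomind::gemm::get_weight_and_scales_layout(sm, /*force_simt=*/false);
    // sm >= 75: row-major weights packed as HMMA_16816 | OPERAND_B | 2,
    //           row-major scales/zeros packed as HMMA_16816 | OPERAND_V | 1
    // sm == 70: col-major weights packed as HMMA_884 | OPERAND_B | 1
    (void)w_order; (void)w_pack; (void)q_order; (void)q_pack;
}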
+ +#pragma once + +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/cp_async.h" +#include "src/turbomind/kernels/gemm/iterator_sm70.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include + +template +__device__ void print_type(T) +{ + if (threadIdx.x == 0) { + printf("%s\n", __PRETTY_FUNCTION__); + } +} + +namespace turbomind::gemm { + +template +struct ConvertOperand { + + static constexpr int M = M_; + static constexpr int K = K_; + + using Operand = MakeOperand, M_, K_, 1>; + + using Ts = typename Operand::Dtype; + using SmemLayout = typename Operand::SmemLayout; + using GmemIter = typename Operand::GmemIter; + + using Atom = typename Operand::SmemCopyAtom; + + using SmemCopy = SmemCopy; + + using Accessor = SmemAccessor; + + static constexpr auto kOrderS = Operand::kOrder; + + static constexpr int ITER_K = ceil_div(K, Atom::K); + + /// TODO: generailize this + static constexpr int WARP_CNT = 1; + + using PtrD = get_pointer_type; + + struct Param { + int m; + int k; + const Ts* src; + int lds; + PtrD dst; + int ldd; + }; + + using SharedStorage = Array; + + template + static constexpr int get_fragment_size(Array (&)[M]) + { + return N; + } + + template + static constexpr int get_fragment_num(Array (&)[M]) + { + return M; + } + + __device__ constexpr int2 _mk2cs(int m, int k) + { + return mk2cs(m, k); + } + + __device__ void operator()(const Param& param, char* smem_buf) + { + Ts* smem = (Ts*)smem_buf; + + const int cta_cnt_m = ceil_div(param.m, M); + const int cta_cnt_k = ceil_div(param.k, K); + + const int cta_idx_m = blockIdx.x; + + const int cta_offset_m = cta_idx_m * M; + const int residue_m = min(M, param.m - cta_offset_m); + + const int warp_id = threadIdx.x / WARP_SIZE; + + const int warp_offset_m = 0; + + Converter converter{}; + + typename SmemCopy::Frag data; + + constexpr int kFragSize = get_fragment_size(data); + constexpr int kFragNum = get_fragment_num(data); + constexpr int kPackSize = kFragSize * Pack_M; + + const int pack_cnt_k = ceil_div(param.k, Atom::K); + const int pack_cnt_m = ceil_div(param.m, Atom::M * Pack_M); + + if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) { + // printf("m=%d, k=%d, lds = %d\n", param.m, param.k, param.lds); + // printf( + // "CTA_M=%d, CTA_K=%d, cta_cnt_m=%d, cta_cnt_k=%d, cta_idx_m=%d, ITER_K=%d, pack_cnt_m=%d, + // pack_cnt_k=%d\n", M_, K_, cta_cnt_m, cta_cnt_k, cta_idx_m, ITER_K, pack_cnt_m, pack_cnt_k); + // printf("frag_size=%d, frag_num=%d, pack_size=%d\n", kFragSize, kFragNum, kPackSize); + } + + const int cta_offset_k = (cta_cnt_k - 1) * K; + const int residue_k = min(K, param.k - cta_offset_k); + + // Handle residue k first + GmemIter gmem{(Ts*)param.src, param.lds, {cta_offset_m, cta_offset_k}, {residue_m, residue_k}}; + + gmem.smem_data_ = smem; + gmem.ClearSmem(); + + __syncthreads(); + + gmem.Prefetch(true); + + // Rest full k tiles + gmem = GmemIter{(Ts*)param.src, param.lds, {cta_offset_m, 0}, {residue_m, K}}; + gmem.smem_data_ = smem; + + SmemCopy smem_copy({warp_offset_m, 0}); + + // last, 0, 1, 2, 3, ..., last - 1 + int cta_idx_k = cta_cnt_k - 1; + + for (int k_stage = 0; k_stage < cta_cnt_k; ++k_stage) { + __syncthreads(); + + PRAGMA_UNROLL + for (int k = 0; k < ITER_K; ++k) { + // Assuming `SmemCopy` is a warp-level operation + // Load from smem as we are doing GEMMs + // SmemCopy::copy(smem, data, int2{warp_offset_m, 0}, k); 
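+                // Each k step pulls one Atom::K slice of the shared-memory
+                // tile into registers in MMA fragment order; the m loop below
+                // then fuses Pack_M consecutive m-fragments into one packed
+                // store, skipping duplicate lanes (repeat_id != 0) so the
+                // same packed word is not written twice.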
+ smem_copy(smem, data, k); + + PRAGMA_UNROLL + for (int m = 0; m < kFragNum; m += Pack_M) { + // Convert and pack rmem data + Array packed = converter((Array&)data[m]); + + // Logical pack coords + const int pack_idx_k = cta_idx_k * ITER_K + k; + const int pack_idx_m = ((cta_idx_m * WARP_CNT + warp_id) * kFragNum + m) / Pack_M; + + // Linear pack index + const int pack_index = cs2idx(_mk2cs(pack_idx_m, pack_idx_k), // + _mk2cs(pack_cnt_m, pack_cnt_k).x); + + auto [unique_id, repeat_id] = Atom::unique(threadIdx.x, pack_index); + + // Store in [pack_id, lane_id], static cast is needed to decay SubBytePtr to T* + auto dst_ptr = static_cast(param.dst + unique_id * kPackSize); + + if (pack_idx_m < pack_cnt_m && pack_idx_k < pack_cnt_k && repeat_id == 0) { + Store(dst_ptr, packed); + } + } + } + + __syncthreads(); + + if (k_stage == cta_cnt_k - 1) { + break; + } + + gmem.Prefetch(true); + gmem.Advance(); + + cta_idx_k = k_stage; + } + } + + __device__ void print(...) {} + + __device__ void print(Array _x) + { + auto& x = (const Array&)_x; + printf("tidx=%d, %f %f %f %f\n", (int)threadIdx.x, (float)x[0], (float)x[1], (float)x[2], (float)x[3]); + } +}; + +extern __shared__ char smem_buf[]; + +template +__global__ void convert_kernel(typename Kernel::Param param) +{ + Kernel kernel; + kernel(param, smem_buf); +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/cp_async.h b/src/turbomind/kernels/gemm/cp_async.h new file mode 100644 index 0000000000..bbf976b777 --- /dev/null +++ b/src/turbomind/kernels/gemm/cp_async.h @@ -0,0 +1,211 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include + +#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 4) +#define L2_CACHEHINT(size) ".L2::" #size "B" +#else +#define L2_CACHEHINT(size) +#endif + +namespace turbomind { + +enum class CacheOp +{ + kDefault, // use global when possible + kAlways, + kGlobal, +}; + +template +struct GetCacheOp { + static constexpr auto value = cache_op; +}; + +template<> +struct GetCacheOp { + static constexpr auto value = CacheOp::kGlobal; +}; + +template +struct GetCacheOp { + static constexpr auto value = CacheOp::kAlways; +}; + +enum class EvictPolicy +{ + kEvictNormal, + kEvictFirst, + kEvictLast, +}; + +namespace cache_policy { + +struct Default { + static constexpr auto kCacheOp = CacheOp::kDefault; + static constexpr auto kEvictPolicy = EvictPolicy::kEvictNormal; +}; + +struct Stream { + static constexpr auto kCacheOp = CacheOp::kDefault; + static constexpr auto kEvictPolicy = EvictPolicy::kEvictFirst; +}; + +struct Reuse { + static constexpr auto kCacheOp = CacheOp::kAlways; + static constexpr auto kEvictPolicy = EvictPolicy::kEvictNormal; +}; + +}; // namespace cache_policy + +template +struct CP_ASYNC { +}; + +template +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* __restrict__ src, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], 16;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global.L2::cache_hint [%1], [%2], 16, %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "l"(cache_policy)); + } + // clang-format on +}; + +template<> +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* 
__restrict__ src, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global" L2_CACHEHINT(64) " [%1], [%2], 16;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global.L2::cache_hint" L2_CACHEHINT(64) " [%1], [%2], 16, %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "l"(cache_policy)); + } + // clang-format on +}; + +template<> +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* __restrict__ src, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global" L2_CACHEHINT(128) " [%1], [%2], 16;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global.L2::cache_hint" L2_CACHEHINT(128) " [%1], [%2], 16, %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "l"(cache_policy)); + } + // clang-format on +}; + +template<> +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* __restrict__ src, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global" L2_CACHEHINT(256) " [%1], [%2], 16;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global.L2::cache_hint" L2_CACHEHINT(256) " [%1], [%2], 16, %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "l"(cache_policy)); + } + // clang-format on +}; + +template +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* __restrict__ src, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global.L2::cache_hint [%1], [%2], %3, %4;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size), "l"(cache_policy)); + } + // clang-format on +}; + +template +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* __restrict__ src, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global" L2_CACHEHINT(64) " [%1], [%2], %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global.L2::cache_hint" L2_CACHEHINT(64) " [%1], [%2], %3, %4;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size), "l"(cache_policy)); + } + // clang-format on +}; + +template +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* __restrict__ src, bool mask) + { + asm volatile("{\n .reg 
.pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global.L2::cache_hint" L2_CACHEHINT(128) " [%1], [%2], %3, %4;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size), "l"(cache_policy)); + } + // clang-format on +}; + +template +struct CP_ASYNC { + // clang-format off + __device__ static void apply(int smem_ptr, const void* __restrict__ src, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global" L2_CACHEHINT(256) " [%1], [%2], %3;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size)); + } + __device__ static void apply(int smem_ptr, const void* __restrict__ src, uint64_t cache_policy, bool mask) + { + asm volatile("{\n .reg .pred p;\n setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global.L2::cache_hint" L2_CACHEHINT(256) " [%1], [%2], %3, %4;\n" + "}\n" ::"r"((int)mask), "r"(smem_ptr), "l"(src), "n"(size), "l"(cache_policy)); + } + // clang-format on +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/cta_map.h b/src/turbomind/kernels/gemm/cta_map.h new file mode 100644 index 0000000000..d73c3142d0 --- /dev/null +++ b/src/turbomind/kernels/gemm/cta_map.h @@ -0,0 +1,86 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" + +namespace turbomind::gemm { + +struct CtaMap { + + TM_HOST_DEVICE static int3 get_tiled_shape(int m, int n, int k, int cta_m, int cta_n, int split_cnt) + { + return {(m + cta_m - 1) / cta_m, (n + cta_n - 1) / cta_n, split_cnt}; + } + + TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int N) + { + auto n = tiled_shape.y; + if (N >= 32 && n >= 24) + return 5; + if (N >= 16 && n >= 12) + return 4; + if (N >= 8 && n >= 6) + return 3; + if (N >= 4 && n >= 3) + return 2; + if (N >= 2 && n >= 2) + return 1; + return 0; + } + + TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile) + { + int tile = 1 << log_tile; + return {static_cast(tiled_shape.x * tile), + static_cast((tiled_shape.y + tile - 1) / tile), + static_cast(tiled_shape.z)}; + } + + TM_DEVICE static int3 get_tile_offset(int log_tile) + { + int block_idx_x = blockIdx.x; + int block_idx_y = blockIdx.y; + int block_idx_z = blockIdx.z; + return {(block_idx_x >> log_tile), // + (block_idx_y << log_tile) + (block_idx_x & ((1 << log_tile) - 1)), + block_idx_z}; + } +}; + +struct CtaMapN: public CtaMap { + TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile) + { + int tile = 1 << log_tile; + return {static_cast(tiled_shape.y * tile), // n * tile + static_cast((tiled_shape.x + tile - 1) / tile), // m / tile + static_cast(tiled_shape.z)}; + } + TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int M) + { + auto m = tiled_shape.x; + if (M >= 32 && m >= 24) + return 5; + if (M >= 16 && m >= 12) + return 4; + if (M >= 8 && m >= 6) + return 3; + if (M >= 4 && m >= 3) + return 2; + if (M >= 2 && m >= 2) + return 1; + return 0; + } + TM_DEVICE static int3 get_tile_offset(int log_tile) + { + int block_idx_x = blockIdx.x; + int block_idx_y = blockIdx.y; + int block_idx_z = blockIdx.z; + return {(block_idx_y << log_tile) + (block_idx_x & ((1 << 
log_tile) - 1)), // + (block_idx_x >> log_tile), + block_idx_z}; + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/desc.h b/src/turbomind/kernels/gemm/desc.h new file mode 100644 index 0000000000..933a4b0201 --- /dev/null +++ b/src/turbomind/kernels/gemm/desc.h @@ -0,0 +1,92 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +// aggregate that uniquely identifies a GEMM problem +struct GemmDesc { + int arch; + DataType type_a; + DataType type_b; + DataType type_c; + Order order_a; + Order order_b; + Order order_c; + Pack pack_a; + Pack pack_b; + Pack pack_u; + Pack pack_v; + QuantDesc quant_a; + QuantDesc quant_b; + Epilogue epilogue; + int m; + int n; + int k; + int batch_dim; +}; + +enum class OpClass +{ + kSIMT, + kMMA_s884, + kMMA_s16816, +}; + +inline const char* to_string(OpClass op) +{ + switch (op) { + case OpClass::kSIMT: + return "simt"; + case OpClass::kMMA_s884: + return "s884"; + case OpClass::kMMA_s16816: + return "s16816"; + default: + return "unknown_op_cls"; + } +} + +// aggregate that uniquely identifies a kernel +struct KernelDesc { + int arch; + OpClass op_class; + DataType type_a; + DataType type_b; + DataType type_c; + Order order_a; + Order order_b; + Order order_c; + Pack pack_a; + Pack pack_b; + Pack pack_u; + Pack pack_v; + QuantDesc quant_a; + QuantDesc quant_b; + int policy_a; + int policy_b; + int3 cta_tile; + int3 mma_tile; + int3 align; + int2 c_tile; + int stages; + bool split_k; + + // set by `KernelImpl` + int max_active_ctas; + cudaFuncAttributes attr; +}; + +class Kernel; +struct LaunchSpec { + Kernel* kernel; + int swizzle; + int splits; + float estimated; + float measured; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/dispatch_cache.cu b/src/turbomind/kernels/gemm/dispatch_cache.cu new file mode 100644 index 0000000000..850cf3a51f --- /dev/null +++ b/src/turbomind/kernels/gemm/dispatch_cache.cu @@ -0,0 +1,414 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/dispatch_cache.h" +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/types.h" +#include +#include +#include +#include +#include +#include +#include + +static inline bool operator==(const int3& a, const int3& b) +{ + return a.x == b.x && a.y == b.y && a.z == b.z; +} + +static inline bool operator==(const int2& a, const int2& b) +{ + return a.x == b.x && a.y == b.y; +} + +namespace turbomind::gemm { + +static inline decltype(auto) as_tuple(const KernelDesc& d) +{ + return std::tie(d.arch, + d.op_class, + d.type_a, + d.type_b, + d.type_c, + d.order_a, + d.order_b, + d.order_c, + d.pack_a, + d.pack_b, + d.pack_u, + d.pack_v, + d.quant_a, + d.quant_b, + d.policy_a, + d.policy_b, + d.cta_tile, + d.mma_tile, + d.align, + d.c_tile, + d.stages, + d.split_k); +} + +static inline bool operator==(const QuantDesc& a, const QuantDesc& b) +{ + return a.type == b.type && a.group_size == b.group_size; +} + +static inline bool operator==(const KernelDesc& a, const KernelDesc& b) +{ + return as_tuple(a) == as_tuple(b); +} + +template +static inline void export_impl(std::ostream& os, const Ts&... 
ts) +{ + ((os << static_cast(ts) << " "), ...); +} + +template +static inline void import_value(std::istream& is, T& value) +{ + int token{}; + is >> token; + value = static_cast(token); +} + +template +static inline void import_impl(std::istream& is, Ts&... ts) +{ + (import_value(is, ts), ...); +} + +void ExportDispatchCache(std::ostream& os, const std::vector>& entries) +{ + for (const auto& [g, spec] : entries) { + // GEMM desc + export_impl(os, + g.arch, + g.type_a, + g.type_b, + g.type_c, + g.order_a, + g.order_b, + g.order_c, + g.pack_a, + g.pack_b, + g.pack_u, + g.pack_v, + g.quant_a.type, + g.quant_a.group_size, + g.quant_b.type, + g.quant_b.group_size, + g.epilogue, + g.m, + g.n, + g.k, + g.batch_dim); + // Kernel desc + auto& k = spec.kernel->desc(); + export_impl(os, + k.arch, + k.op_class, + k.cta_tile.x, + k.cta_tile.y, + k.cta_tile.z, + k.mma_tile.x, + k.mma_tile.y, + k.mma_tile.z, + k.stages, + k.align.x, + k.align.y, + k.align.z, + k.policy_a, + k.policy_b, + k.c_tile.x, + k.c_tile.y, + k.split_k); + // Runtime params + export_impl(os, spec.swizzle, spec.splits); + os << std::endl; + } +} + +void ImportDispatchCache(std::istream& is, + std::vector>& entries, + const std::vector& kernels) +{ + std::string line; + while (std::getline(is, line)) { + std::cout << line << std::endl; + std::stringstream ss(line); + GemmDesc g{}; + import_impl(ss, + g.arch, + g.type_a, + g.type_b, + g.type_c, + g.order_a, + g.order_b, + g.order_c, + g.pack_a, + g.pack_b, + g.pack_u, + g.pack_v, + g.quant_a.type, + g.quant_a.group_size, + g.quant_b.type, + g.quant_b.group_size, + g.epilogue, + g.m, + g.n, + g.k, + g.batch_dim); + KernelDesc k{}; + k.type_a = g.type_a; + k.type_b = g.type_b; + k.type_c = g.type_c; + k.pack_a = g.pack_a; + k.pack_b = g.pack_b; + k.pack_u = g.pack_u; + k.pack_v = g.pack_v; + k.order_a = g.order_a; + k.order_b = g.order_b; + k.order_c = g.order_c; + k.quant_a = g.quant_a; + k.quant_b = g.quant_b; + import_impl(ss, + k.arch, + k.op_class, + k.cta_tile.x, + k.cta_tile.y, + k.cta_tile.z, + k.mma_tile.x, + k.mma_tile.y, + k.mma_tile.z, + k.stages, + k.align.x, + k.align.y, + k.align.z, + k.policy_a, + k.policy_b, + k.c_tile.x, + k.c_tile.y, + k.split_k); + LaunchSpec spec{}; + import_impl(ss, spec.swizzle, spec.splits); + for (const auto& p : kernels) { + if (p->desc() == k) { + spec.kernel = p; + break; + } + } + if (spec.kernel) { + entries.emplace_back(g, spec); + } + else { + std::cerr << "No kernel found for entry: " << line << "\n"; + } + } +} + +namespace { + +inline decltype(auto) as_tuple(const GemmDesc& d) +{ + return std::tie(d.arch, + d.type_a, + d.type_b, + d.type_c, + d.order_a, + d.order_b, + d.order_c, + d.pack_a, + d.pack_b, + d.pack_u, + d.pack_v, + d.quant_a.type, + d.quant_a.group_size, + d.quant_b.type, + d.quant_b.group_size, + d.m, + d.n, + d.k, + d.batch_dim); + // d.epilogue +} + +} // namespace + +inline bool operator<(const GemmDesc& a, const GemmDesc& b) +{ + return as_tuple(a) < as_tuple(b); +} + +int extract_batch_size(GemmDesc& desc) +{ + return std::exchange(desc.batch_dim == 0 ? desc.m : desc.n, 0); +} + +void set_batch_size(GemmDesc& desc, int batch_size) +{ + (desc.batch_dim == 0 ? 
desc.m : desc.n) = batch_size; +} + +struct DispatchCache::Impl { + + struct Flat { + std::vector> idxs; + std::vector specs; + }; + + const std::vector kernels_; + std::map cache_; + + Impl(std::vector kernels): kernels_(std::move(kernels)) {} + + std::optional Find(GemmDesc desc, bool exact) const + { + const int batch_size = extract_batch_size(desc); + // std::cerr << batch_size << " " << desc.m << " " << desc.n << " " << desc.k << "\n"; + const auto it = cache_.find(desc); + if (it != cache_.end()) { + const auto& [idxs, specs] = it->second; + // Find index via key + const auto p = + std::lower_bound(idxs.begin(), idxs.end(), std::make_pair(batch_size, 0), [](auto& a, auto& b) { // + return a.first < b.first; + }); + // std::cerr << p->first << " " << p->second << "\n"; + if (p != idxs.end() && (!exact || p->first == batch_size)) { + return specs[p->second]; + } + } + return {}; + } + + bool Insert(GemmDesc desc, const LaunchSpec& spec) + { + const int batch_size = extract_batch_size(desc); + + auto it = cache_.find(desc); + if (it == cache_.end()) { + it = cache_.emplace_hint(it, desc, Flat{}); + } + auto& [idxs, specs] = it->second; + // Find index via key + const auto p = + std::lower_bound(idxs.begin(), idxs.end(), std::make_pair(batch_size, 0), [](auto& a, auto& b) { // + return a.first < b.first; + }); + // Exact match, skip + if (p != idxs.end() && p->first == batch_size) { + return false; + } + // Insert + idxs.insert(p, {batch_size, (int)specs.size()}); + specs.push_back(spec); + return true; + } + + int Export(std::ostream& os) const + { + std::vector> entries; + for (const auto& [desc, flat] : cache_) { + auto tmp = desc; + for (const auto& [batch_size, index] : flat.idxs) { + set_batch_size(tmp, batch_size); + entries.emplace_back(tmp, flat.specs[index]); + } + } + Summary(entries); + ExportDispatchCache(os, entries); + return entries.size(); + } + + int Import(std::istream& is) + { + std::vector> entries; + ImportDispatchCache(is, entries, kernels_); + Summary(entries); + for (auto [desc, spec] : entries) { + const int batch_size = extract_batch_size(desc); + auto it = cache_.find(desc); + if (it == cache_.end()) { + it = cache_.emplace_hint(it, desc, Flat{}); + } + auto& [idxs, specs] = it->second; + // Order is not maintained at this point + idxs.emplace_back(batch_size, (int)specs.size()); + specs.push_back(spec); + } + // Sort indices and deduplicate + for (auto& [desc, flat] : cache_) { + auto& [idxs, specs] = flat; + const auto cmp = [](auto& a, auto& b) { // + return a.first < b.first; + }; + std::stable_sort(idxs.begin(), idxs.end(), cmp); + idxs.erase(std::unique(idxs.begin(), idxs.end(), cmp), idxs.end()); + // Remove unreferenced specs and update spec indices + std::vector tmp; + for (auto& [key, val] : idxs) { + int old = std::exchange(val, tmp.size()); + tmp.push_back(specs[old]); + } + specs = std::move(tmp); + } + return entries.size(); + } + + // Print a summary of how many cases a kernel is used + void Summary(const std::vector>& entries) const + { + std::vector uses{nullptr}; + std::copy(kernels_.begin(), kernels_.end(), std::back_inserter(uses)); + + for (const auto& [_, s] : entries) { + uses.push_back(s.kernel); + } + std::sort(uses.begin(), uses.end()); + std::vector> count; + for (size_t i = 1; i < uses.size(); ++i) { + if (uses[i] != uses[i - 1]) { + count.emplace_back(-1, uses[i]); + } + ++count.back().first; + } + std::sort(count.begin(), count.end(), std::greater<>{}); + for (const auto& [n, k] : count) { + std::cout << k->name() << ": " << n << 
"\n"; + } + } +}; + +DispatchCache::DispatchCache(std::vector kernels): impl_(std::make_unique(std::move(kernels))) {} + +DispatchCache::~DispatchCache() = default; + +std::optional DispatchCache::Find(const GemmDesc& desc) const +{ + return impl_->Find(desc, true); +} + +std::optional DispatchCache::LowerBound(const GemmDesc& desc) const +{ + return impl_->Find(desc, false); +} + +bool DispatchCache::Insert(const GemmDesc& desc, const LaunchSpec& spec) +{ + return impl_->Insert(desc, spec); +} + +int DispatchCache::Export(std::ostream& os) const +{ + return impl_->Export(os); +} + +int DispatchCache::Import(std::istream& is) +{ + return impl_->Import(is); +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/dispatch_cache.h b/src/turbomind/kernels/gemm/dispatch_cache.h new file mode 100644 index 0000000000..0542f3a541 --- /dev/null +++ b/src/turbomind/kernels/gemm/dispatch_cache.h @@ -0,0 +1,32 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/desc.h" + +#include +#include +#include + +namespace turbomind::gemm { + +class DispatchCache { +public: + DispatchCache(std::vector kernels); + + ~DispatchCache(); + + std::optional LowerBound(const GemmDesc& desc) const; + + std::optional Find(const GemmDesc& desc) const; + + bool Insert(const GemmDesc& desc, const LaunchSpec& spec); + + int Export(std::ostream& os) const; + + int Import(std::istream& is); + +private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/epilogue.h b/src/turbomind/kernels/gemm/epilogue.h new file mode 100644 index 0000000000..c2c5ff54ce --- /dev/null +++ b/src/turbomind/kernels/gemm/epilogue.h @@ -0,0 +1,465 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/core/sync.h" +#include "src/turbomind/kernels/gemm/iterator_sm80.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +template +struct ChannelCombination_v2 { + const Tc* __restrict__ scale_ptr; + const Tc* __restrict__ bias_ptr; + + template + __device__ void operator()(Array (&x)[S][C], int2 cta_cs, int2 thr_cs, int2 delta_cs, int2 end_cs) const + { + // T scale[S]; + + Array scale; + fill(scale, T(1)); + + if (scale_ptr) { + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + const int ss = thr_cs.y + s * delta_cs.y; + if (ss < end_cs.y) { + scale[s] = static_cast(__ldg(scale_ptr + ss + cta_cs.y)); + } + } + } + + T bias[S]{}; + + if (bias_ptr) { + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + const int ss = thr_cs.y + s * delta_cs.y; + if (ss < end_cs.y) { + bias[s] = static_cast(__ldg(bias_ptr + ss + cta_cs.y)); + } + } + } + + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + using namespace ops; + x[s][c] = x[s][c] * scale[s] + bias[s]; + } + } + } +}; + +template +struct ChannelCombination_v3 { + const Tc* __restrict__ scale_bias_ptr; + + template + __device__ void operator()(Array (&x)[S][C], int2 cs0, pair, Pred& pred) const + { + __align__(16) Array scale_bias[S]; + + if (scale_bias_ptr) { + constexpr int ds = sizeof(Tc) * delta_s; + auto ptr = reinterpret_cast(scale_bias_ptr + cs0.y); + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + if (pred(s, 0)) { + Ldg(scale_bias[s], reinterpret_cast(ptr)); + } + ptr += ds; + } + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + auto tmp = cast(scale_bias[s]); + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + using namespace ops; + x[s][c] = x[s][c] * tmp[0] + tmp[1]; + } + } + } + } +}; + +template +struct MatrixCombination_v2 { + float alpha; + float beta; + + const Tc* C_ptr; // can't `__restrict__` since it may be alias of `D` + int64_t ldc; + + template + __device__ void operator()(Array (&x)[S][C], int2 cs0, pair, Pred& pred) const + { + Array frag[S][C]{}; + if (beta) { + constexpr int dc = sizeof(Tc) * delta_c; + const int ds = sizeof(Tc) * delta_s * ldc; + auto ptr = reinterpret_cast(C_ptr + cs2idx(cs0, (int64_t)ldc)); + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + if (pred(s, c)) { + Load(frag[s][c], reinterpret_cast(ptr)); + } + ptr += dc; + } + ptr -= dc * C; + ptr += ds; + } + } + + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + using namespace ops; + x[s][c] = x[s][c] * alpha + cast(frag[s][c]) * beta; + } + } + } +}; + +template +struct MatrixCombination_v3 { + float alpha; + float beta; + + const Tc* C_ptr; // can't `__restrict__` since it may be alias of `D` + int64_t ldc; + + template + __device__ void operator()(Array (&x)[S][C], int2 cs0, pair, Pred& pred) const + { + + if (beta) { + Array frag[S][C]; + constexpr int dc = sizeof(Tc) * delta_c; + const int ds = sizeof(Tc) * delta_s * ldc; + auto ptr = reinterpret_cast(C_ptr + cs2idx(cs0, (int64_t)ldc)); + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + if (pred(s, c)) { + Load(frag[s][c], reinterpret_cast(ptr)); + using 
namespace ops; + x[s][c] = x[s][c] * alpha + cast(frag[s][c]) * beta; + } + ptr += dc; + } + ptr -= dc * C; + ptr += ds; + } + } + else if (alpha != 1.f) { + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + using namespace ops; + x[s][c] = x[s][c] * alpha; + } + } + } + } +}; + +template +struct GatedActivation { + template + __device__ static void apply(Array& x) + { + static_assert(N % 2 == 0); + PRAGMA_UNROLL + for (int i = 0; i < N; i += 2) { + x[i / 2] = static_cast(Act::apply(x[i]) * x[i + 1]); + } + } +}; + +struct Silu { + __device__ static float apply(float x) + { + return fdividef(x, 1.f + expf(-x)); + } +}; + +template +struct EpilogueParam { + int m; + int n; + Tc* C; + int ldc; + + float* partial_C; + int partial_C_ld; + + int* locks; // (m/cta_m, n/cta_n, k) + + // ChannelCombination_v3 combine_chn; + MatrixCombination_v3 combine_mat; + bool silu_act; +}; + +template +struct Epilogue_ { + + using Dtype = typename OperandC::Dtype; + + static constexpr auto kOrder = OperandC::kOrder; + static constexpr auto SplitK = SplitK_; + + using Tc = Tc_; + + static constexpr int TM = TM_; + static constexpr int TN = TN_; + + using SmemLayout = decltype(OperandC::GetSmemLayout::apply(pair{})); + + using SmemAccessorV2 = SmemAccessorV2; + + using SharedStorage = Array; + + using Map = decltype(OperandC::GetThreadMap::apply(pair{}, constant{})); + + static constexpr int S = Map::kIterS; + static constexpr int C = Map::kIterC; + static constexpr int kAccess = Map::kAccessC; + + template + using OutputC = Array; + + template + __device__ void Rearrange(FragC& frag_C, SharedStorage& storage, OutputC (&out)[S][C]) + { + SmemAccessorV2 smem_C{storage.data()}; + + const int2 thr_cs = Map::get_offset(threadIdx.x / WARP_SIZE, threadIdx.x % WARP_SIZE); + + constexpr int kPeriodC = ceil_div(SmemLayout::C0, Map::kDeltaC); + constexpr int kPeriodS = ceil_div(SmemLayout::S0, Map::kDeltaS); + + int phases[kPeriodS][kPeriodC]; + PRAGMA_UNROLL + for (int s = 0; s < kPeriodS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < kPeriodC; ++c) { + phases[s][c] = SmemLayout::apply(s * Map::kDeltaS + thr_cs.y, c * Map::kDeltaC + thr_cs.x); + } + } + + constexpr bool kRaked = true; + + PRAGMA_UNROLL + for (int m = 0; m < M; m += TM) { + PRAGMA_UNROLL + for (int n = 0; n < N; n += TN) { + // Store to shared memory + RearrangeC::apply(frag_C, smem_C, {m, n}, pair{}); + + // Load from shared memory + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + const int cc = c * Map::kDeltaC + thr_cs.x; + const int ss = s * Map::kDeltaS + thr_cs.y; + + const int2 mn = + kRaked ? 
cs2mk(c * Map::kDeltaC, s * Map::kDeltaS) : cs2mk(cc, ss); + const int mm = mn.x - m; + const int nn = mn.y - n; + const bool mask = (M <= TM || (0 <= mm && mm < TM)) && ((N <= TN) || (0 <= nn && nn < TN)); + + const int2 _cs = mk2cs(m, n); + const int offset_0 = SmemLayout::apply( // + s / kPeriodS * kPeriodS * Map::kDeltaS - _cs.y, + c / kPeriodC * kPeriodC * Map::kDeltaC - _cs.x); + const int offset_p = phases[s % kPeriodS][c % kPeriodC]; + + if (mask) { + Load(out[s][c], &storage[offset_0 + offset_p]); + } + } + } + __syncthreads(); + } + } + } + + template + __device__ void StoreC(const VecC& vec_C, T* data_C, int ldc, int2 cs0, Pred& pred) + { + constexpr int dc = sizeof(T) * Map::kDeltaC; + const int ds = sizeof(T) * Map::kDeltaS * ldc; + auto ptr = reinterpret_cast(data_C + cs2idx(cs0, (int64_t)ldc)); + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + const auto tmp = cast(vec_C[s][c]); + if (pred(s, c)) { + Store(reinterpret_cast(ptr), tmp); + } + ptr += dc; + } + ptr -= dc * C; + ptr += ds; + } + } + + template + __device__ void Reduce( + FragC& frag_C, int splits, int64_t split_size, const int2& cta_cs, Pred& pred, const EpilogueParam& param) + { + using Vec = OutputC; + const int2 thr_cs = Map::get_offset(threadIdx.x / WARP_SIZE, threadIdx.x % WARP_SIZE); + for (int k = 0; k < splits; ++k) { + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + const int ss = thr_cs.y + s * Map::kDeltaS; + const int cc = thr_cs.x + c * Map::kDeltaC; + const int64_t idx = k * split_size + (cta_cs.y + ss) * param.partial_C_ld + (cta_cs.x + cc); + if (true) { + Vec tmp; + Load(tmp, ¶m.partial_C[idx]); + using namespace ops; + frag_C[s][c] = frag_C[s][c] + tmp; + } + } + } + } + } + + template + __device__ void + Reduce_v2(FragC& frag_C, int split_id, bool is_last, int2 cs0, Pred& pred, const EpilogueParam& param) + { + constexpr int dc = sizeof(Dtype) * Map::kDeltaC; + const int ds = sizeof(Dtype) * Map::kDeltaS * param.partial_C_ld; + + const auto ptr0 = reinterpret_cast(param.partial_C + cs2idx(cs0, (int64_t)param.partial_C_ld)); + + Pred ld_mask = split_id == 0 ? Pred{} : pred; + Pred st_mask = is_last ? Pred{} : pred; + + auto ptr = ptr0; + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + OutputC tmp{}; // ! 
ZERO-filled + if (ld_mask(s, c)) { + Load(tmp, reinterpret_cast(ptr)); + } + if (1) { + using namespace ops; + frag_C[s][c] = frag_C[s][c] + tmp; + } + if (st_mask(s, c)) { + Store(reinterpret_cast(ptr), frag_C[s][c]); + } + ptr += dc; + } + ptr -= dc * C; + ptr += ds; + } + } + + template + __device__ void operator()(FragC& frag_C, + const int3& tile_offset, + const int3& tiled_shape, + int end_m, + int end_n, + bool is_last_split, + const EpilogueParam& param, + SharedStorage& storage) + { + const int2 cta_cs = mk2cs(tile_offset.x * M, tile_offset.y * N); + const int2 end_cs = mk2cs(end_m, end_n); + + OutputC tmp_C[S][C]; + + Rearrange(frag_C, storage, tmp_C); + + Predicate pred{}; // 1 regs + + const int2 thr_cs = Map::get_offset(threadIdx.x / WARP_SIZE, threadIdx.x % WARP_SIZE); + const int2 cs0 = {cta_cs.x + thr_cs.x, cta_cs.y + thr_cs.y}; + + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + const int ss = thr_cs.y + s * Map::kDeltaS; + const int cc = thr_cs.x + c * Map::kDeltaC; + if (ss < end_cs.y && cc < end_cs.x) { + pred.set(s, c); + } + } + } + + if (SplitK_ && tiled_shape.z > 1) { + int* barrier = ¶m.locks[tile_offset.x * tiled_shape.y + tile_offset.y]; + + sem_wait(barrier, tile_offset.z, threadIdx.x == 0); + + Reduce_v2(tmp_C, tile_offset.z, is_last_split, cs0, pred, param); + + const int post_id = is_last_split ? 0 : tile_offset.z + 1; + sem_post(barrier, post_id, threadIdx.x == 0); + + if (!is_last_split) { + return; + } + } + + constexpr pair delta_cs{}; + + // param.combine_chn(tmp_C, cs0, delta_cs, pred); + + param.combine_mat(tmp_C, cs0, delta_cs, pred); + + if (param.silu_act) { + constexpr int dc = sizeof(Tc) * Map::kDeltaC / 2; + const int ds = sizeof(Tc) * Map::kDeltaS * param.ldc; + auto ptr = reinterpret_cast(param.C + cs2idx({cs0.x / 2, cs0.y}, (int64_t)param.ldc)); + PRAGMA_UNROLL + for (int s = 0; s < S; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + GatedActivation::apply(tmp_C[s][c]); + if (pred(s, c)) { + const auto tmp = cast((Array&)tmp_C[s][c]); + Store(reinterpret_cast(ptr), tmp); + } + ptr += dc; + } + ptr -= dc * C; + ptr += ds; + } + } + else { + StoreC(tmp_C, param.C, param.ldc, cs0, pred); + } + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/format.h b/src/turbomind/kernels/gemm/format.h new file mode 100644 index 0000000000..f407edffa4 --- /dev/null +++ b/src/turbomind/kernels/gemm/format.h @@ -0,0 +1,74 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
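+//
+// Converter specializations used by the operand conversion kernel. The
+// identity case forwards the fragment unchanged, and the generic case is a
+// plain element-wise static_cast. The 16-bit -> u4 case narrows each element
+// and then pack() folds eight 4-bit values into one 32-bit word with a
+// shift-or followed by a single __byte_perm (selector 0x5140); the digit
+// strings inside pack() document the resulting interleaved nibble order.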
+ +#pragma once + +#include "src/turbomind/kernels/core/array.h" + +namespace turbomind::gemm { + +template +struct Converter { +}; + +template +struct Converter { + template + __device__ Array operator()(Array x) + { + return x; + } +}; + +template<> +struct Converter { + + static __device__ Array pack(const Array& vi) + { + Array ui = (Array&)vi; + + ui[0] |= (ui[0] >> 12); + ui[1] |= (ui[1] >> 12); + + // 7 6 5 4 3 2 1 0 + // _7_67564_3_23120 + uint32_t uo = __byte_perm(ui[0], ui[1], 0x5140); + + return (Array&)uo; + } + + template + __device__ Array operator()(const Array& x) + { + static_assert(sizeof(U) == 2); + auto& vi = (const Array&)x; + Array tmp; + PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + tmp[i] = static_cast(vi[i]); + } + Array vo; + PRAGMA_UNROLL + for (int i = 0; i < N; i += 8) { + (Array&)vo[i] = pack((Array&)tmp[i]); + } + return vo; + } +}; + +template<> +struct Converter { + template + __device__ Array operator()(const Array& x) + { + // static_assert(sizeof(U) == 2); + auto& vi = (const Array&)x; + Array vo; + PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + vo[i] = static_cast(vi[i]); + } + return vo; + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/gemm.cu b/src/turbomind/kernels/gemm/gemm.cu new file mode 100644 index 0000000000..c6d3739a1e --- /dev/null +++ b/src/turbomind/kernels/gemm/gemm.cu @@ -0,0 +1,402 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/dispatch_cache.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/tuner/params.h" +#include "src/turbomind/kernels/gemm/tuner/sampler.h" +#include "src/turbomind/kernels/gemm/types.h" +#include +#include +#include +#include +#include +#include + +namespace turbomind::gemm { + +void ExportDispatchCache(std::ostream& os, const std::vector>& entries); + +void ImportDispatchCache(std::istream& is, + std::vector>& entries, + const std::vector>& kernels); + +namespace { + +template +std::vector ArgSort(size_t size, const Cmp& cmp) +{ + std::vector idxs(size); + std::iota(idxs.begin(), idxs.end(), 0); + std::stable_sort(idxs.begin(), idxs.end(), cmp); + return idxs; +} + +inline int get_batch_dim(const GemmDesc& desc) +{ + return desc.batch_dim == 0 ? desc.m : desc.n; +} + +inline int get_batch_dim(const KernelDesc& desc, int batch_dim) +{ + return batch_dim == 0 ? desc.cta_tile.x : desc.cta_tile.y; +} + +} // namespace + +struct Gemm::Impl { + + Impl(): + props_{GetCudaDeviceProps()}, + arch_{props_->major * 100 + props_->minor * 10}, + registry_{props_}, + cache_{registry_.kernels()} + { + if (auto str = std::getenv("TM_GEMM_TUNE")) { + try { + ParseTuningParams(tuning_, str); + } + catch (...) 
{ + std::cerr << "[Gemm2] Failed to parse `TM_GEMM_TUNE`, default value will be used.\n"; + tuning_ = {}; + } + } + measurer_.emplace(CreateStoppingCriterion(tuning_.min_iter, tuning_.max_iter, tuning_.max_time)); + } + + // find launch spec in dispatch cache, dispatch by heuristic on cache miss + LaunchSpec Dispatch(DispatchPolicy policy, GemmDesc desc, size_t barriers_size, size_t partials_size) + { + if (policy & DispatchPolicy::kReuse) { + if (auto spec = cache_.LowerBound(desc)) { + return *spec; + } + std::cerr << "Failed to find a feasible kernel in the cache, will dispatch by heuristic.\n"; + } + + if (auto spec = cache_.Find(desc)) { + return *spec; + } + + auto specs = Find(desc, barriers_size, partials_size, 1); + if (!specs.empty()) { + cache_.Insert(desc, specs.front()); + return specs.front(); + } + return {}; + } + + std::vector Find(const GemmDesc& desc, size_t barrier_size, size_t partials_size, int top_k) + { + std::vector feasible; + std::copy_if(registry_.kernels().begin(), registry_.kernels().end(), std::back_inserter(feasible), [&](auto p) { + return p->is_feasible(desc); + }); + if (feasible.empty()) { + return {}; + } + + if (1) { + int max_batch_size = 0; + for (const auto& k : feasible) { + max_batch_size = std::max(get_batch_dim(k->desc(), desc.batch_dim), max_batch_size); + } + const int batch_size = get_batch_dim(desc); + for (const auto& k : feasible) { + const auto x = get_batch_dim(k->desc(), desc.batch_dim); + if (x >= batch_size) { + max_batch_size = std::min(max_batch_size, x); + } + } + auto pred = [&](auto k) { return get_batch_dim(k->desc(), desc.batch_dim) > max_batch_size; }; + feasible.erase(std::remove_if(feasible.begin(), feasible.end(), pred), feasible.end()); + } + + std::vector> clusters; + { + std::vector tmp; + tmp.reserve(feasible.size()); + for (const auto& k : feasible) { + LaunchSpec spec{k}; + tmp.push_back(spec); + } + clusters = Cluster(tmp, ClusteringParam{false, true}); + } + std::vector proxies; + proxies.reserve(clusters.size()); + + for (const auto& c : clusters) { + proxies.push_back(c.front().kernel); + } + + // cluster_id, splits, metrics + std::vector> metrics; + + for (int cluster_id = 0; cluster_id < (int)proxies.size(); ++cluster_id) { + auto& kernel = *proxies[cluster_id]; + const int max_splits = kernel.GetMaxSplits(desc.m, desc.n, desc.k, barrier_size, partials_size); + + auto ms = kernel.Estimate_v2({desc.m, desc.n, desc.k}, // + std::min(max_splits, tuning_.max_splits), + tuning_.max_waves, + props_->multiProcessorCount); + + for (const auto& [splits, metric] : ms) { + metrics.emplace_back(cluster_id, splits, metric); + } + } + + // std::cerr << "#kernel: " << kernels.size() << ", #cluster: " << clusters.size() + // << ", #metric: " << metrics.size() << "\n"; + + std::vector mio_cost; + std::vector mma_cost; + for (const auto& [_, s, m] : metrics) { + mio_cost.push_back(m.mio_cost); + mma_cost.push_back(m.mma_cost); + } + + const auto mio_max = *std::max_element(mio_cost.begin(), mio_cost.end()); + const auto mma_max = *std::max_element(mma_cost.begin(), mma_cost.end()); + + std::vector mio_ratio; + std::vector mma_ratio; + std::vector avg_ratio; + for (size_t i = 0; i < metrics.size(); ++i) { + mio_ratio.push_back(static_cast(mio_cost[i]) / mio_max); + mma_ratio.push_back(static_cast(mma_cost[i]) / mma_max); + avg_ratio.push_back(.5f * (mio_ratio.back() + mma_ratio.back())); + } + + auto idxs = ArgSort(metrics.size(), [&](int i, int j) { // + return avg_ratio[i] < avg_ratio[j]; + }); + + // for (const auto& i : idxs) 
{ + // auto [cid, s, m] = metrics[i]; + // std::cout << clusters[cid].front().kernel->name() << " s" << s << " " << avg_ratio[i] << " " << + // mio_ratio[i] + // << " " << mma_ratio[i] << " " << m.mio_cost << " " << m.mma_cost << "\n"; + // } + + top_k = top_k > 0 ? std::min(idxs.size(), top_k) : (int)idxs.size(); + std::vector ret; + ret.reserve(top_k); + for (int i = 0; i < top_k; ++i) { + const auto& [cluster_id, splits, cost] = metrics[idxs[i]]; + // Apply `splits` to all kernels in the cluster + for (const auto& s : clusters[cluster_id]) { + ret.push_back(LaunchSpec{s.kernel, tuning_.swizzle.at(0), splits}); + } + } + + return ret; + } + + template + int Measure(const GemmDesc& desc, + size_t barriers_size, + size_t partials_size, + int top_k, + LaunchFunc launch_func, + cudaStream_t st) + { + // Early exit on exact match + if (cache_.Find(desc)) { + return 0; + } + // std::cerr << "GEMM: " << desc.m << "x" << desc.n << "x" << desc.k << "\n"; + + const auto tmp = Find(desc, barriers_size, partials_size, tuning_.top_k); + + std::vector specs; + for (const auto& spec : tmp) { + // populate swizzle parameters + const auto swis = FilterSwizzleParam(*spec.kernel, desc.m, desc.n, desc.k, tuning_.swizzle); + for (const auto& swi : swis) { + specs.push_back(spec); + specs.back().swizzle = swi; + } + } + + specs = Sampler{*measurer_, tuning_.clusters}.Run(specs, launch_func, st); + + // for (const auto& s : specs) { + // std::cout << s.kernel->name() // + // << " swizzle=" << s.swizzle // + // << ", splits=" << s.splits // + // << ", measured=" << s.measured << "ms\n"; + // } + + if (!specs.empty()) { + cache_.Insert(desc, specs.front()); + } + else { + std::cerr << "No valid kernel found for the problem\n"; + return -1; + } + + return 0; + } + + std::vector FilterSwizzleParam(Kernel& kernel, int m, int n, int k, const std::vector& swis) + { + std::vector swizzles; + for (const auto& swi : swis) { + // To use splits=1 here, swizzling must not depends on split count + swizzles.push_back(kernel.GetSwizzle(m, n, k, 1, swi)); + } + if (swizzles.size() == 1) { + return swizzles; + } + + // De-duplicate possible swizzles while keep the order + std::sort(swizzles.begin(), swizzles.end()); + swizzles.erase(std::unique(swizzles.begin(), swizzles.end()), swizzles.end()); + + std::vector tmp; + std::copy_if(swis.begin(), swis.end(), std::back_inserter(tmp), [&](int swi) { + return std::find(swizzles.begin(), swizzles.end(), swi) != swizzles.end(); + }); + tmp.swap(swizzles); + + return swizzles; + } + + /// TODO: move to cuda utils + static std::unique_ptr GetCudaDeviceProps() + { + auto props = std::make_unique(); + int device_id = -1; + cudaGetDevice(&device_id); + cudaGetDeviceProperties(props.get(), device_id); + return props; + } + + std::shared_ptr props_; + + int arch_; + + Registry registry_; + + TuningParams tuning_; + + std::optional measurer_; + + DispatchCache cache_; +}; + +// implementation of GEMM interfaces + +Gemm::Gemm(): impl_{new Impl{}} {} + +Gemm::~Gemm() = default; + +int Gemm::Run(const Operation& operation, + float alpha, + const void* A, + const MatrixLayout& Adesc, + const void* U, + const MatrixLayout& Udesc, + const void* B, + const MatrixLayout& Bdesc, + const void* V, + const MatrixLayout& Vdesc, + float beta, + const void* C, + const MatrixLayout& Cdesc, + void* D, + const MatrixLayout& Ddesc, + const Workspace& workspace, + cudaStream_t stream) +{ + + if (Adesc.rows != Ddesc.rows || Bdesc.cols != Ddesc.cols || Adesc.cols != Bdesc.rows) { + return -1; + } + + const int m = 
Ddesc.rows; + const int n = Ddesc.cols; + const int k = Adesc.cols; + + const GemmDesc desc{ + impl_->arch_, + Adesc.type, + Bdesc.type, + Ddesc.type, + Adesc.order, + Bdesc.order, + Ddesc.order, + Adesc.pack, + Bdesc.pack, + Udesc.pack, + Vdesc.pack, + operation.quant_a, + operation.quant_b, + operation.epilogue, + m, + n, + k, + }; + + const auto launch = [&](LaunchSpec spec, cudaStream_t st) { + auto _workspace = workspace; + return spec.kernel->Launch(operation, + alpha, + A, + Adesc, + U, + Udesc, + B, + Bdesc, + V, + Vdesc, + beta, + C, + Cdesc, + D, + Ddesc, + spec.swizzle, + spec.splits, + _workspace, + st); + }; + + LaunchSpec spec{}; + + if (operation.dispatch & DispatchPolicy::kMeasure) { + impl_->Measure(desc, workspace.barriers_size, workspace.partials_size, 1, launch, stream); + } + + spec = impl_->Dispatch(operation.dispatch, desc, workspace.barriers_size, workspace.partials_size); + + if (spec.kernel) { + // std::cout << "[Gemm] dispatch: " << spec.kernel->name() // + // << " split_k=" << spec.splits // + // << " swizzle=" << spec.swizzle << std::endl; + return launch(spec, stream); + } + + printf("No feasible kernel found for the problem.\n"); + + return -1; +} + +int Gemm::Export(std::ostream& os) +{ + return impl_->cache_.Export(os); +} + +int Gemm::Import(std::istream& is) +{ + return impl_->cache_.Import(is); +} + +std::vector Gemm::GetTuningSeq() const +{ + return impl_->tuning_.seq; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/gemm.h b/src/turbomind/kernels/gemm/gemm.h new file mode 100644 index 0000000000..5a23486b1b --- /dev/null +++ b/src/turbomind/kernels/gemm/gemm.h @@ -0,0 +1,55 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/gemm/types.h" +#include +#include +#include + +namespace turbomind::gemm { + +class Gemm { +public: + static constexpr size_t kBarriersSize = 1 << 20; + static constexpr size_t kPartialsSize = 32 << 20; + + Gemm(); + + ~Gemm(); + + [[nodiscard]] int Run(const Operation& operation, + float alpha, + const void* A, + const MatrixLayout& Adesc, + const void* U, + const MatrixLayout& Udesc, + const void* B, + const MatrixLayout& Bdesc, + const void* V, + const MatrixLayout& Vdesc, + float beta, + const void* C, + const MatrixLayout& Cdesc, + void* D, + const MatrixLayout& Ddesc, + const Workspace& workspace, + cudaStream_t stream); + + [[maybe_unused]] int Export(std::ostream& os); + + [[maybe_unused]] int Import(std::istream& is); + + [[nodiscard]] std::vector GetTuningSeq() const; + +private: + struct Impl; + std::unique_ptr impl_; +}; + +[[nodiscard]] int +Convert(const void* S, const MatrixLayout& Sdesc, void* D, const MatrixLayout& Ddesc, cudaStream_t stream); + +std::tuple get_weight_and_scales_layout(int sm, bool force_simt); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/gemm_universal.h b/src/turbomind/kernels/gemm/gemm_universal.h new file mode 100644 index 0000000000..9375dcb6fa --- /dev/null +++ b/src/turbomind/kernels/gemm/gemm_universal.h @@ -0,0 +1,174 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
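+//
+// GemmUniversal (below) glues a Mainloop and an Epilogue into one kernel.
+// Each CTA picks its (m, n, split) tile through CtaMap's swizzled offset, and
+// for split-k the K dimension is partitioned in units of kChunkSizeK, the
+// larger of CTA_K and the quantization group sizes, presumably so a scale
+// group never straddles two splits. A worked example of the chunk split,
+// assuming the host fills chunk_per_split = chunk_cnt / splits and
+// chunk_offset = splits - chunk_cnt % splits (the latter matches the field
+// comment in GemmParams): with 10 chunks over 3 splits, chunk_per_split = 3
+// and chunk_offset = 2, so split 0 covers chunks [0, 3), split 1 covers
+// [3, 6) and split 2 covers [6, 10), i.e. the [3, 3, 4] distribution that the
+// comment inside operator() notes is better than ceil_div's [4, 4, 2].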
+ +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/math.h" + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/epilogue.h" +#include "src/turbomind/kernels/gemm/thread_map.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +template +struct GemmParams { + int m; + int n; + int k; + + PtrA A; + int lda; + PtrU U; + int ldu; + PtrB B; + int ldb; + PtrV V; + int ldv; + + int log_tile; + int3 tiled_shape; + + int chunk_per_split; + int chunk_offset; // splits - chunk_cnt % splits + + EpilogueParam epilogue; +}; + +template +struct GemmUniversal { + + // using Impl = typename Mainloop::Impl; + using Impl = Mainloop; + + using Ta = typename Impl::Ta; + using Tb = typename Impl::Tb; + using Tu = typename Impl::Tu; + using Tv = typename Impl::Tv; + + using Epilogue = Epilogue_; + + using Tc = typename Epilogue::Tc; + + using Arch = Arch_; + using CtaMap = CtaMap_; + + // col major == M-major (A) + // row major == N-major (B) + static constexpr Order kOrderC = Epilogue::kOrder; + + static constexpr int CTA_M = Impl::CTA_M; + static constexpr int CTA_N = Impl::CTA_N; + static constexpr int CTA_K = Impl::CTA_K; + + static constexpr bool SplitK = Epilogue::SplitK; + + using FragC = typename Impl::FragC; + + static constexpr int WARP_CNT = Impl::WARPS; + + using OperandA = typename Mainloop::OperandA; + using OperandB = typename Mainloop::OperandB; + using OperandU = typename Mainloop::OperandU; + using OperandV = typename Mainloop::OperandV; + + static constexpr int kChunkSizeK = std::max(CTA_K, std::max(OperandU::kGroupSize, OperandV::kGroupSize)); + + static constexpr int kGroupSizeU = OperandU::kGroupSize; + static constexpr int kGroupSizeV = OperandV::kGroupSize; + + union SharedStorage { + typename Mainloop::SharedStorage mainloop; + typename Epilogue::SharedStorage epilogue; + }; + + static constexpr Order kOrderA = OperandA::kOrder; + static constexpr Order kOrderB = OperandB::kOrder; + static constexpr Order kOrderU = OperandU::kOrder; + static constexpr Order kOrderV = OperandV::kOrder; + + static constexpr Pack kPackA = OperandA::kPack; + static constexpr Pack kPackB = OperandB::kPack; + + using PtrA = get_pointer_type; + using PtrB = get_pointer_type; + using PtrU = get_pointer_type; + using PtrV = get_pointer_type; + + using Param = GemmParams; + + __device__ void operator()(const Param& param, const CtaMap& cta_map, char* smem_buf) + { + const auto tile_offset = CtaMap::get_tile_offset(param.log_tile); + + const auto& tiled_shape = param.tiled_shape; + + // Sub-optimal when the split is uneven + // e.g. 
ceil_div(10, 3) = 4 -> [4, 4, 2], however [3, 3, 4] is better in every aspect + // const int chunk_cnt = (param.k + kChunkSizeK - 1) / kChunkSizeK; + // const int chunk_per_split = (chunk_cnt + tiled_shape.z - 1) / tiled_shape.z; + // const int offset_k = chunk_per_split * kChunkSizeK * tile_offset.z; + // const int gemm_k_size = std::min(offset_k + chunk_per_split * kChunkSizeK, param.k) - offset_k; + + int chunk_id = tile_offset.z * param.chunk_per_split + max(tile_offset.z - param.chunk_offset, 0); + int offset_k = chunk_id * kChunkSizeK; + int gemm_k_size = (param.chunk_per_split + int(tile_offset.z >= param.chunk_offset)) * kChunkSizeK; + gemm_k_size = min(offset_k + gemm_k_size, param.k) - offset_k; + + const int offset_m = tile_offset.x * CTA_M; + const int offset_n = tile_offset.y * CTA_N; + + if (offset_m >= param.m || offset_n >= param.n || offset_k >= param.k) { // empty tile + return; + } + + const int end_m = min(CTA_M, param.m - offset_m); + const int end_n = min(CTA_N, param.n - offset_n); + + SharedStorage& storage = *reinterpret_cast(smem_buf); + + // Is 8 enough? + __align__(8) FragC frag_C{}; + + int tile_iter = (gemm_k_size + CTA_K - 1) / CTA_K; + + typename OperandA::GmemIter gmem_A{param.A, param.lda, {offset_m, offset_k}, {end_m, CTA_K}}; + typename OperandB::GmemIter gmem_B{param.B, param.ldb, {offset_n, offset_k}, {end_n, CTA_K}}; + + /// TODO: move `ceil_div` into `GmemIter` + typename OperandU::GmemIter gmem_U{ + param.U, param.ldu, {offset_m, ceil_div(offset_k, kGroupSizeU)}, {end_m, ceil_div(CTA_K, kGroupSizeU)}}; + typename OperandV::GmemIter gmem_V{ + param.V, param.ldv, {offset_n, ceil_div(offset_k, kGroupSizeV)}, {end_n, ceil_div(CTA_K, kGroupSizeV)}}; + + Mainloop mainloop{}; + + mainloop(gmem_A, gmem_B, gmem_U, gmem_V, frag_C, tile_iter, storage.mainloop); + + Epilogue epilogue{}; + + const bool is_primary = offset_k + gemm_k_size == param.k; + + epilogue(frag_C, tile_offset, tiled_shape, end_m, end_n, is_primary, param.epilogue, storage.epilogue); + } +}; + +extern __shared__ char smem_buf[]; + +template +__global__ void gemm_kernel(Params params, CtaMap cta_map) +{ +#if __CUDA_ARCH__ + if constexpr (Kernel::Arch::is_compatible(__CUDA_ARCH__)) { + Kernel kernel; + kernel(params, cta_map, smem_buf); + } +#endif +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/gpu_metric.cu b/src/turbomind/kernels/gemm/gpu_metric.cu new file mode 100644 index 0000000000..76e3dbfd72 --- /dev/null +++ b/src/turbomind/kernels/gemm/gpu_metric.cu @@ -0,0 +1,163 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
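+//
+// Micro-benchmarks for L2 bandwidth and MMA throughput. The L2 probe streams
+// a buffer sized to a multiple of the L2 (l2CacheSize * 64) with __ldcg
+// float4 loads; the LOG_TILE swizzle groups 2^LOG_TILE consecutive blocks
+// onto the same addresses so later blocks hit in L2 (setting it to 0 would
+// minimize the hit rate, per the comment below). The reported figure is
+//
+//   bytes_per_second = BLOCK_Y * sizeof(float) * data.size() / ms * 1e3
+//
+// The MMA probe times a square FP16 cublasGemmEx with FP32 accumulation and
+// returns fused multiply-adds per second, i.e. problem_size^3 / ms * 1e3.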
+ +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/gpu_metric.h" +#include + +#include + +namespace turbomind::gemm { + +using thrust::device_vector; + +namespace { + +template +__global__ void l2_bw(float* dsink, const float* array, int count) +{ + int tid = threadIdx.x + (blockIdx.x >> LOG_TILE) * blockDim.x; + float4 sink{}; + + constexpr int NUM_THREADS = BLOCK_NUM * BLOCK_DIM; + + for (int i = 0; i < count; i += NUM_THREADS * 4) { + const float* ptr = array + i; + const int offset = tid * 4; + float4 data = __ldcg(reinterpret_cast(ptr + offset)); + sink.x += data.x; + sink.y += data.y; + sink.z += data.z; + sink.w += data.w; + } + + dsink[threadIdx.x] = sink.x + sink.y + sink.z + sink.w; +} + +} // namespace + +float MeasureL2CacheThroughput() +{ + cudaDeviceProp prop{}; + int device{}; + cudaGetDevice(&device); + cudaGetDeviceProperties(&prop, device); + + size_t size = static_cast(prop.l2CacheSize) * 64; + + std::cout << size << std::endl; + + constexpr int BLOCK_X = 128; // blocks participating single sweep + constexpr int BLOCK_Y = 128; // full sweep iters + constexpr int LOG_TILE = 5; // swizzling factor to bring up L2 hit rate, set to 0 will minimize hit rate + + constexpr int BLOCK_DIM = 256; + + constexpr int CHUNK_SIZE = BLOCK_X * BLOCK_DIM * 4; // x4 for float4 load pattern + + device_vector data(ceil_div(size, sizeof(float)) / CHUNK_SIZE * CHUNK_SIZE); + device_vector dsink(BLOCK_DIM); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cudaMemsetAsync(data.data().get(), 0, sizeof(float) * data.size(), stream); + + cudaEvent_t ev_start, ev_end; + + cudaEventCreate(&ev_start); + cudaEventCreate(&ev_end); + + cudaEventRecord(ev_start, stream); + + l2_bw<<> LOG_TILE), BLOCK_DIM, 0, stream>>>( + dsink.data().get(), data.data().get(), data.size()); + + cudaEventRecord(ev_end, stream); + + cudaEventSynchronize(ev_end); + + float ms{}; + cudaEventElapsedTime(&ms, ev_start, ev_end); + + size_t bytes = BLOCK_Y * sizeof(float) * data.size(); + + const float bytes_per_second = bytes / ms * 1e3; + std::cout << bytes_per_second / 1e9 << " GB/s" << std::endl; + + cudaEventDestroy(ev_start); + cudaEventDestroy(ev_end); + + cudaStreamDestroy(stream); + + return bytes_per_second; +} + +float MeasureMmaThroughput(int problem_size) +{ + device_vector a(problem_size * problem_size); + device_vector b(a.size()); + device_vector c(a.size()); + + cublasHandle_t cublas{}; + cublasCreate(&cublas); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + cublasSetStream(cublas, stream); + + cudaEvent_t ev_start, ev_end; + + cudaEventCreate(&ev_start); + cudaEventCreate(&ev_end); + + cudaEventRecord(ev_start, stream); + + float alpha = 1.f; + float beta = 0.f; + cublasGemmEx(cublas, + CUBLAS_OP_N, + CUBLAS_OP_N, + problem_size, + problem_size, + problem_size, + &alpha, + a.data().get(), + CUDA_R_16F, + problem_size, + b.data().get(), + CUDA_R_16F, + problem_size, + &beta, + c.data().get(), + CUDA_R_16F, + problem_size, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT); + + cudaEventRecord(ev_end, stream); + + cudaEventSynchronize(ev_end); + + float ms{}; + cudaEventElapsedTime(&ms, ev_start, ev_end); + + cudaEventDestroy(ev_start); + cudaEventDestroy(ev_end); + + cudaStreamDestroy(stream); + + cublasDestroy(cublas); + + const size_t ops = (size_t)problem_size * problem_size * problem_size; + + float fma_per_second = ops / ms * 1e3; + + std::cout << 2 * fma_per_second / 
1e9 << " FLOPS/s" << std::endl; + + return fma_per_second; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/gpu_metric.h b/src/turbomind/kernels/gemm/gpu_metric.h new file mode 100644 index 0000000000..b1f9f9938b --- /dev/null +++ b/src/turbomind/kernels/gemm/gpu_metric.h @@ -0,0 +1,15 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +// bytes / second +float MeasureL2CacheThroughput(); + +// fused multiply-add / second +float MeasureMmaThroughput(int proble_size = 16384); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/iterator.h b/src/turbomind/kernels/gemm/iterator.h new file mode 100644 index 0000000000..71152c11fe --- /dev/null +++ b/src/turbomind/kernels/gemm/iterator.h @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/thread_map.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +struct VoidGmemIter { + static constexpr int ITER_S = 0; + using Fragments = int; + template + __device__ VoidGmemIter(P, int, int2, int2) + { + } + __device__ void ClearSmem() {} + __device__ void Prefetch(int, int, bool) {} + __device__ void Prefetch(bool) {} + __device__ void Fetch(Fragments&, bool) {} + __device__ void Store(const Fragments&) {} + __device__ void Advance() {} + int* smem_data_; + bool g_mask{false}; +}; + +struct GetGmemIter { + template + static constexpr auto + apply(basic_type, basic_type, basic_type, pair, constant) + { + using Dtype = typename Operand::Dtype; + + constexpr int kAccessSize = + std::min(128 / bitsof, std::max(32 / bitsof, M * K / (WARPS * WARP_SIZE))); + + constexpr int2 kAligned = mk2cs(0, 1); + constexpr int2 kCS = mk2cs(M, K); + + constexpr int kMaxThrS = std::min(WARP_SIZE, ceil_div(kCS.y, WARPS)); + constexpr int kMaxThrC = std::min(WARP_SIZE, ceil_div(kCS.x, kAccessSize)); + + constexpr int kTgtThrC = ceil_div(256, sizeof(Array)); + + constexpr int kWarpThrC = std::min(kMaxThrC, std::max(WARP_SIZE / kMaxThrS, kTgtThrC)); + + using GmemIter = typename Iterator::template Type, + SmemLayout, + Operand::kPack, + Operand::kOrder, + kAligned.x, // aligned C + kAligned.y>; // aligned S + return type_c; + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/iterator_sm70.h b/src/turbomind/kernels/gemm/iterator_sm70.h new file mode 100644 index 0000000000..4f32777da0 --- /dev/null +++ b/src/turbomind/kernels/gemm/iterator_sm70.h @@ -0,0 +1,265 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
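// A quick worked example of the unit conversions used by the two metrics
// above: an N x N x N GEMM performs N^3 fused multiply-adds (2 * N^3 FLOPs),
// and cudaEventElapsedTime reports milliseconds. The 41 ms figure below is
// hypothetical, chosen only to show the arithmetic.
#include <cstdio>

int main()
{
    const double    ms = 41.0;   // hypothetical elapsed time for one GEMM
    const long long n  = 16384;  // default problem_size of MeasureMmaThroughput

    const double fma_per_second  = double(n) * n * n / ms * 1e3;
    const double flop_per_second = 2.0 * fma_per_second;  // one multiply + one add per FMA

    std::printf("%.1f TFLOP/s\n", flop_per_second / 1e12);  // ~214.5 for these numbers
    return 0;
}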
+ +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/gemm/cp_async.h" +#include "src/turbomind/kernels/gemm/predicate.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include +#include + +namespace turbomind::gemm { + +template +inline __device__ void _Ld(Array& dst, const T* src) +{ + static_assert(sizeof(Array) <= sizeof(uint4)); + + if constexpr (sizeof(Array) == sizeof(uint4)) { + (uint4&)dst = __ldcs((const uint4*)src); + } + else if constexpr (sizeof(Array) == sizeof(uint2)) { + (uint2&)dst = __ldcs((const uint2*)src); + } + else if constexpr (sizeof(Array) == sizeof(uint)) { + (uint&)dst = __ldcs((const uint*)src); + } + else { + static_assert(!std::is_same_v); + } +} + +template +struct GmemIteratorSm70 { + + using ThreadMap = Map; + + using AccessType = Array; + using Pointer = get_pointer_type; + + using Policy = Policy_; + + static constexpr int ITER_S = Map::kIterS; + static constexpr int ITER_C = Map::kIterC; + + const char* src_data_; + + int src_offset_; + int dst_offset_; + + int offset_c_; + int offset_s_; + + int src_step_c_; + int src_step_s_; + + int src_step_k_; + + Predicate pred_; + + bool g_mask{true}; + + SmemAccessor smem_data_; + + static constexpr int2 kMK0 = cs2mk(SmemLayout::C0, SmemLayout::S0); + static constexpr int kPeriodC = ceil_div(SmemLayout::C0, Map::kDeltaC); + static constexpr int kPeriodS = ceil_div(SmemLayout::S0, Map::kDeltaS); + + int phases_[kPeriodS][kPeriodC]; + + using Fragments = AccessType[Map::kIterS][Map::kIterC]; + + __device__ static constexpr int2 pack(int2 mk) + { + return Packing_v2::apply(mk); + } + + __device__ static constexpr int2 to_cs(int2 mk) + { + return mk2cs(mk.x, mk.y); + } + + __device__ GmemIteratorSm70(): smem_data_{Pointer{nullptr}} {}; + + __device__ GmemIteratorSm70(Pointer data, int stride_s, int2 offset, int2 extent): smem_data_{Pointer{(T*)nullptr}} + { + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + data = data + cs2idx(to_cs(pack(offset)), stride_s); + extent = to_cs(pack(extent)); + + int2 offsets = Map::get_offset(warp_id, lane_id); + int src_offset = offsets.x + offsets.y * stride_s; + + offset_c_ = offsets.x; + offset_s_ = offsets.y; + + auto src_ptr = reinterpret_cast((T*)data); + + if constexpr (pred_.is_active) { + PRAGMA_UNROLL + for (int s = 0; s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < Map::kIterC; ++c) { + int ss = offset_s_ + s * Map::kDeltaS; + int cc = offset_c_ + c * Map::kDeltaC; + if (ss < extent.y && cc < extent.x) { + pred_.set(s, c); + } + } + } + } + + PRAGMA_UNROLL + for (int s = 0; s < kPeriodS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < kPeriodC; ++c) { + phases_[s][c] = SmemLayout::apply(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); + } + } + + src_offset_ = src_offset * bitsof / bitsof; + + src_step_c_ = bitsof * Map::kDeltaC / bitsof; + src_step_s_ = bitsof * Map::kDeltaS * stride_s / bitsof; + + src_step_k_ = bitsof * cs2mk(Map::kDimC, Map::kDimS * stride_s).y / bitsof; + + // initialize for the first tile + src_data_ = src_ptr + src_offset_; + } + + __device__ constexpr int _src_step_k() const + { + return src_step_k_; + } + + __device__ void ClearSmem(int pipe_iter = 0) + { + PRAGMA_UNROLL + for (int s = 0; s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int 
c = 0; c < Map::kIterC; ++c) { + const int pred_s = offset_s_ + s * Map::kDeltaS < Map::kDimS; + const int pred_c = offset_c_ + c * Map::kDeltaC < Map::kDimC; + auto ptr = &smem_data_(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); + if ((Map::kAlignedC && Map::kAlignedS) || (pred_s && pred_c)) { + turbomind::Store(ptr, Array{}); + } + } + } + } + + __device__ void Prefetch(int begin, int count, bool tile_mask) + { + PRAGMA_UNROLL + for (int s = begin; s < begin + count && s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < Map::kIterC; ++c) { + // auto dst = &smem_data_(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); + + const int i0 = SmemLayout::apply( // + s / kPeriodS * kPeriodS * Map::kDeltaS, + c / kPeriodC * kPeriodC * Map::kDeltaC); + const int i1 = phases_[s % kPeriodS][c % kPeriodC]; + auto dst = &smem_data_.ptr_[i0 + i1]; + + Copy(std::true_type{}, dst, src_data_ + src_step_c_ * c, tile_mask && g_mask && pred_(s, c)); + } + src_data_ += src_step_s_; + if (s == Map::kIterS - 1) { + src_data_ -= src_step_s_ * Map::kIterS; + src_data_ += _src_step_k(); + } + } + } + + __device__ void Prefetch(bool tile_mask) + { + Prefetch(0, Map::kIterS, tile_mask); + } + + __device__ void Advance() + { + if (!g_mask) { + src_data_ -= _src_step_k(); + } + } + + __device__ void Copy(std::true_type, T* dst, const char* __restrict__ src, bool mask) + { + if (mask) { + AccessType frag; + if constexpr (Policy_::kEvictPolicy != EvictPolicy::kEvictNormal) { + _Ld(frag, (const T*)src); + } + else { + Ldg(frag, (const T*)src); + } + turbomind::Store(dst, frag); + } + } + + __device__ void Fetch(Fragments& frags, bool tile_mask) + { + PRAGMA_UNROLL + for (int s = 0; s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < Map::kIterC; ++c) { + Copy2(frags[s][c], src_data_ + src_step_c_ * c, tile_mask && g_mask && pred_(s, c)); + } + src_data_ += src_step_s_; + if (s == Map::kIterS - 1) { + src_data_ -= src_step_s_ * Map::kIterS; + src_data_ += _src_step_k(); + } + } + } + + __device__ void Store(Fragments& frags) + { + PRAGMA_UNROLL + for (int s = 0; s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < Map::kIterC; ++c) { + // auto dst = &smem_data_(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); + + const int i0 = SmemLayout::apply( // + s / kPeriodS * kPeriodS * Map::kDeltaS, + c / kPeriodC * kPeriodC * Map::kDeltaC); + const int i1 = phases_[s % kPeriodS][c % kPeriodC]; + auto dst = &smem_data_.ptr_[i0 + i1]; + + if (pred_(s, c)) { + turbomind::Store(dst, frags[s][c]); + } + } + } + } + + __device__ void Copy2(AccessType& frag, const char* __restrict__ src, bool mask) + { + if (mask) { + if constexpr (Policy_::kEvictPolicy != EvictPolicy::kEvictNormal) { + _Ld(frag, (const T*)src); + } + else { + Ldg(frag, (const T*)src); + } + } + } +}; + +template +struct IteratorSm70 { + template + using Type = GmemIteratorSm70; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/iterator_sm80.h b/src/turbomind/kernels/gemm/iterator_sm80.h new file mode 100644 index 0000000000..eab85e67b7 --- /dev/null +++ b/src/turbomind/kernels/gemm/iterator_sm80.h @@ -0,0 +1,213 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
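// A minimal device-side sketch of the streaming load that _Ld above wraps:
// __ldcs lowers to ld.global.cs, marking data as likely to be read only once
// so it is evicted from cache early. Assumes `src` is 16-byte aligned and `n`
// is a multiple of 8; the reduction is just a stand-in for real work.
#include <cuda_fp16.h>

__global__ void stream_sum(const half* __restrict__ src, float* dst, int n)
{
    float     acc    = 0.f;
    const int stride = gridDim.x * blockDim.x * 8;
    for (int i = (blockIdx.x * blockDim.x + threadIdx.x) * 8; i < n; i += stride) {
        const uint4  raw = __ldcs(reinterpret_cast<const uint4*>(src + i));  // 8 halves per load
        const half2* h2  = reinterpret_cast<const half2*>(&raw);
        for (int j = 0; j < 4; ++j) {
            acc += __low2float(h2[j]) + __high2float(h2[j]);
        }
    }
    atomicAdd(dst, acc);
}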
+ +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/smem.h" +#include "src/turbomind/kernels/gemm/cp_async.h" +#include "src/turbomind/kernels/gemm/predicate.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include +#include + +namespace turbomind::gemm { + +template +struct GmemIteratorSm80 { + + using ThreadMap = Map; + + using AccessType = Array; + using Pointer = get_pointer_type; + + using Policy = Policy_; + + static constexpr int ITER_S = Map::kIterS; + static constexpr int ITER_C = Map::kIterC; + + const char* src_data_; + + int src_offset_; + int dst_offset_; + + int offset_c_; + int offset_s_; + + int src_step_c_; + int src_step_s_; + + int src_step_k_; + + Predicate pred_; + + bool g_mask{true}; + + SmemAccessor smem_data_; + + static constexpr int2 kMK0 = cs2mk(SmemLayout::C0, SmemLayout::S0); + static constexpr int kPeriodC = ceil_div(SmemLayout::C0, Map::kDeltaC); + static constexpr int kPeriodS = ceil_div(SmemLayout::S0, Map::kDeltaS); + + int phases_[kPeriodS][kPeriodC]; + + uint64_t cache_policy_{}; + + __device__ static constexpr int2 pack(int2 mk) + { + return Packing_v2::apply(mk); + } + + __device__ static constexpr int2 to_cs(int2 mk) + { + return mk2cs(mk.x, mk.y); + } + + __device__ GmemIteratorSm80(): smem_data_{Pointer{nullptr}} {}; + + __device__ GmemIteratorSm80(Pointer data, int stride_s, int2 offset, int2 extent): smem_data_{Pointer{(T*)nullptr}} + { + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + data = data + cs2idx(to_cs(pack(offset)), stride_s); + extent = to_cs(pack(extent)); + + int2 offsets = Map::get_offset(warp_id, lane_id); + int src_offset = offsets.x + offsets.y * stride_s; + + offset_c_ = offsets.x; + offset_s_ = offsets.y; + + auto src_ptr = reinterpret_cast((T*)data); + + if constexpr (pred_.is_active) { + PRAGMA_UNROLL + for (int s = 0; s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < Map::kIterC; ++c) { + int ss = offset_s_ + s * Map::kDeltaS; + int cc = offset_c_ + c * Map::kDeltaC; + if (ss < extent.y && cc < extent.x) { + pred_.set(s, c); + } + } + } + } + + PRAGMA_UNROLL + for (int s = 0; s < kPeriodS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < kPeriodC; ++c) { + phases_[s][c] = SmemLayout::apply(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); + } + } + + src_offset_ = src_offset * bitsof / bitsof; + + src_step_c_ = bitsof * Map::kDeltaC / bitsof; + src_step_s_ = bitsof * Map::kDeltaS * stride_s / bitsof; + + src_step_k_ = bitsof * cs2mk(Map::kDimC, Map::kDimS * stride_s).y / bitsof; + + // initialize for the first tile + src_data_ = src_ptr + src_offset_; + +#if TURBOMIND_ARCH_SM80 + if constexpr (Policy::kEvictPolicy != EvictPolicy::kEvictNormal) { + asm volatile("createpolicy.fractional.L2::evict_first.b64 %0;\n" : "=l"(cache_policy_) :); + } +#endif + } + + __device__ constexpr int _src_step_k() const + { + return src_step_k_; + } + + __device__ void ClearSmem(int pipe_iter = 0) + { + PRAGMA_UNROLL + for (int s = 0; s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < Map::kIterC; ++c) { + const int pred_s = offset_s_ + s * Map::kDeltaS < Map::kDimS; + const int pred_c = offset_c_ + c * Map::kDeltaC < Map::kDimC; + auto ptr = &smem_data_(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); + if 
((Map::kAlignedC && Map::kAlignedS) || (pred_s && pred_c)) { + Store(ptr, Array{}); + } + } + } + } + + __device__ void Prefetch(int begin, int count, bool tile_mask) + { + PRAGMA_UNROLL + for (int s = begin; s < begin + count && s < Map::kIterS; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < Map::kIterC; ++c) { + // auto dst = &smem_data_(offset_s_ + s * Map::kDeltaS, offset_c_ + c * Map::kDeltaC); + + const int i0 = SmemLayout::apply( // + s / kPeriodS * kPeriodS * Map::kDeltaS, + c / kPeriodC * kPeriodC * Map::kDeltaC); + const int i1 = phases_[s % kPeriodS][c % kPeriodC]; + auto dst = &smem_data_.ptr_[i0 + i1]; + + CpAsync(std::true_type{}, dst, src_data_ + src_step_c_ * c, tile_mask && g_mask && pred_(s, c)); + } + src_data_ += src_step_s_; + if (s == Map::kIterS - 1) { + src_data_ -= src_step_s_ * Map::kIterS; + src_data_ += _src_step_k(); + } + } + } + + __device__ void Prefetch(bool tile_mask) + { + Prefetch(0, Map::kIterS, tile_mask); + } + + __device__ void Advance() + { + if (!g_mask) { + src_data_ -= _src_step_k(); + } + } + + __device__ void CpAsync(std::true_type, T* dst, const char* __restrict__ src, bool mask) + { +#if TURBOMIND_ARCH_SM80 + constexpr int size = sizeof(AccessType); + static_assert(size <= 16); + + constexpr int prefetch_size = std::min(256, size * Map::kWarpThreadC); + + auto ptr = cast_smem_ptr_to_uint(dst); + + static constexpr auto cache_op = GetCacheOp::value; + + if constexpr (Policy::kEvictPolicy != EvictPolicy::kEvictNormal) { + CP_ASYNC::apply(ptr, src, cache_policy_, mask); + } + else { + CP_ASYNC::apply(ptr, src, mask); + } +#else + assert(TURBOMIND_ARCH_SM80); +#endif + } +}; + +template +struct IteratorSm80 { + template + using Type = GmemIteratorSm80; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel.cu b/src/turbomind/kernels/gemm/kernel.cu new file mode 100644 index 0000000000..4b2eddc36a --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel.cu @@ -0,0 +1,234 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
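// A rough sketch of the cp.async pattern GmemIteratorSm80 above is built
// around, written with the CUDA pipeline primitives (sm_80+): stage the next
// global tile into shared memory asynchronously while computing on the current
// one. Launch with a single block of 128 threads; `gmem` is assumed to hold
// `tiles * 128` float4 elements.
#include <cuda_pipeline_primitives.h>

__global__ void double_buffered_sum(const float4* __restrict__ gmem, float* out, int tiles)
{
    __shared__ float4 smem[2][128];

    int stage = 0;
    __pipeline_memcpy_async(&smem[stage][threadIdx.x], &gmem[threadIdx.x], sizeof(float4));
    __pipeline_commit();  // tile 0 in flight

    float acc = 0.f;
    for (int t = 0; t < tiles; ++t) {
        if (t + 1 < tiles) {  // prefetch the next tile into the other buffer
            __pipeline_memcpy_async(&smem[stage ^ 1][threadIdx.x],
                                    &gmem[(t + 1) * 128 + threadIdx.x], sizeof(float4));
            __pipeline_commit();
            __pipeline_wait_prior(1);  // the older copy (current tile) has landed
        }
        else {
            __pipeline_wait_prior(0);  // drain: the last tile has landed
        }
        __syncthreads();
        const float4 v = smem[stage][threadIdx.x];
        acc += v.x + v.y + v.z + v.w;  // stand-in for the MMA work
        __syncthreads();               // everyone done reading before the slot is reused
        stage ^= 1;
    }
    out[threadIdx.x] = acc;
}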
+ +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/arch.h" +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/types.h" +#include +#include +#include +#include + +namespace turbomind::gemm { + +bool Kernel::is_feasible(const GemmDesc& desc) const noexcept +{ + constexpr bool debug = 0; + + if constexpr (debug) + printf("S\n"); + + if (!is_arch_compatible(desc_.arch, desc.arch)) { + return false; + } + + if constexpr (debug) + printf("S0\n"); + + if (std::tie(desc.order_a, desc.order_b, desc.order_c) != std::tie(desc_.order_a, desc_.order_b, desc_.order_c)) { + return false; + } + + if constexpr (debug) + printf("A\n"); + + if (std::tie(desc.type_a, desc.type_b, desc.type_c) != std::tie(desc_.type_a, desc_.type_b, desc_.type_c)) { + return false; + } + + if constexpr (debug) { + printf("B\n"); + printf("%X %X %X %X\n", desc.pack_a, desc_.pack_a, desc.pack_u, desc_.pack_u); + } + + if (std::tie(desc.pack_a, desc.pack_u) != std::tie(desc_.pack_a, desc_.pack_u)) { + return false; + } + + if constexpr (debug) { + printf("C\n"); + printf("%X %X %X %X\n", desc.pack_b, desc_.pack_b, desc.pack_v, desc_.pack_v); + } + + if (std::tie(desc.pack_b, desc.pack_v) != std::tie(desc_.pack_b, desc_.pack_v)) { + return false; + } + + if constexpr (debug) + printf("D\n"); + + if (desc.quant_a.type != desc_.quant_a.type || desc.quant_a.group_size != desc_.quant_a.group_size) { + return false; + } + + if constexpr (debug) + printf("E\n"); + + if (desc.quant_b.type != desc_.quant_b.type || desc.quant_b.group_size != desc_.quant_b.group_size) { + return false; + } + + if constexpr (debug) + printf("F\n"); + + if (desc.m % desc_.align.x || desc.n % desc_.align.y || desc.k % desc_.align.z) { + return false; + } + + if constexpr (debug) + printf("success\n"); + + return true; +} + +std::vector> +Kernel::Estimate_v2(std::array size, int max_splits, int max_waves, int sm_count) const +{ + const auto [m, n, k] = size; + const int64_t tiled_shape_m = ceil_div(m, desc_.cta_tile.x); + const int64_t tiled_shape_n = ceil_div(n, desc_.cta_tile.y); + const int chunk_cnt_k = ceil_div(k, chunk_size_k_); + + // Despite we only have sm_count * constant tensor cores, this is the granularity for scheduling + const int concurrency = sm_count * desc_.max_active_ctas; + const float waves_per_split = float(tiled_shape_m * tiled_shape_n) / concurrency; + const float splits_per_wave = 1.f / waves_per_split; + + // Tile quantization + const int64_t ceil_m = tiled_shape_m * desc_.cta_tile.x; + const int64_t ceil_n = tiled_shape_n * desc_.cta_tile.y; + + std::vector> metrics; + + for (int splits = 1; splits <= max_splits; ++splits) { + // Split quantization, penalize uneven splits + const int64_t split_ceil_k = ceil_div(chunk_cnt_k, splits) * chunk_size_k_; + // Footprint for single split + const int64_t split_mma_cost = ceil_m * ceil_n * split_ceil_k; + // Footprint for single wave + const int64_t wave_mma_cost = split_mma_cost * splits_per_wave; + + // Wave quantization + // const int waves = (int)std::ceil(wave_per_split * splits); + + // Bold simulation of thread block scheduling + const int grid_size = tiled_shape_m * tiled_shape_n * splits; + const int full_waves = grid_size / concurrency; + const int residue = grid_size % concurrency; + const float partial_wave = (float)ceil_div(residue, sm_count) / desc_.max_active_ctas; + const float waves = full_waves + partial_wave; + + if (splits > 1 && waves > max_waves) { + break; 
+ } + // ceil(tiled_mn / C * splits) * C / tiled_mn * ceil_m * ceil_n * split_ceil_k + const int64_t mma_cost = wave_mma_cost * waves; + + // IO has less severe quantization effect + const int64_t mio_cost_a = get_size(desc_.type_a, tiled_shape_n * m * split_ceil_k) * splits; + const int64_t mio_cost_b = get_size(desc_.type_b, tiled_shape_m * n * split_ceil_k) * splits; + /// TODO: read type from `desc_.accum` when added + const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)m * n) * (splits - 1) * 2; + const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; + + // std::cout << name() << " " << splits << " " << waves << " " << (float)mio_cost << " " << (float)mma_cost + // << "\n"; + + metrics.emplace_back(splits, KernelMetric{mio_cost, mma_cost}); + } + + return metrics; +} + +std::string Kernel::GetName() const +{ + std::stringstream ss; + + ss << "sm" << desc_.arch / 10; + ss << "_" << to_string(desc_.type_a); // + if ((int)desc_.quant_a.type) { + ss << "g" << desc_.quant_a.group_size; + } + ss << "_" << to_string(desc_.type_b); // + if ((int)desc_.quant_b.type) { + ss << "g" << desc_.quant_b.group_size; + } + ss << "_" << to_string(desc_.type_c); + ss << "_" // + << (desc_.order_a == kColMajor ? 'n' : 't') // + << (desc_.order_b == kColMajor ? 'n' : 't') // + << (desc_.order_c == kColMajor ? 'n' : 't'); // + ss << "_" << desc_.cta_tile.x << "x" << desc_.cta_tile.y << "x" << desc_.cta_tile.z // + << "_" << desc_.stages // + << "_" << to_string(desc_.op_class) // + << "_" << desc_.mma_tile.x << "x" << desc_.mma_tile.y << "x" << desc_.mma_tile.z // + << "_c" << desc_.c_tile.x << "x" << desc_.c_tile.y // + << "_a" << desc_.align.x << "x" << desc_.align.y << "x" << desc_.align.z // + << "_" << desc_.policy_a << desc_.policy_b; + + return ss.str(); +} + +template +inline static bool cmp(const int3& a, const int3& b, Op op) +{ + return op(std::tie(a.x, a.y, a.z), std::tie(b.x, b.y, b.z)); +} + +std::vector> Cluster(const std::vector& specs, const ClusteringParam& param) +{ + std::vector ptrs; // pointer into `specs` + for (auto& s : specs) { + ptrs.push_back(&s); + } + + auto less = [&](const LaunchSpec* u, const LaunchSpec* v) { + const auto& a = u->kernel->desc(); + const auto& b = v->kernel->desc(); + if (!cmp(a.cta_tile, b.cta_tile, std::equal_to<>{})) { + return cmp(a.cta_tile, b.cta_tile, std::less<>{}); + } + if (!cmp(a.mma_tile, b.mma_tile, std::equal_to<>{})) { + return cmp(a.mma_tile, b.mma_tile, std::less<>{}); + } + if (param.cache_policy) { + const auto pa = std::tie(a.policy_a, a.policy_b); + const auto pb = std::tie(b.policy_a, b.policy_b); + if (pa != pb) { + return pa < pb; + } + } + if (param.max_active_ctas) { + if (a.max_active_ctas != b.max_active_ctas) { + return a.max_active_ctas < b.max_active_ctas; + } + } + return u->splits < v->splits; + }; + + std::stable_sort(ptrs.begin(), ptrs.end(), less); + + if (ptrs.empty()) { + return {}; + } + std::vector> clusters{{*ptrs[0]}}; + + auto equal = [&](const LaunchSpec* u, const LaunchSpec* v) { // + return !less(u, v) && !less(v, u); + }; + int p = 0; + for (size_t i = 1; i < ptrs.size(); ++i) { + if (equal(ptrs[p], ptrs[i])) { + clusters.back().push_back(*ptrs[i]); + } + else { + clusters.push_back({*ptrs[i]}); + p = i; + } + } + + return clusters; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel.h b/src/turbomind/kernels/gemm/kernel.h new file mode 100644 index 0000000000..34e6094887 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel.h @@ -0,0 +1,118 @@ +// Copyright (c) 
OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/types.h" +#include +#include +#include +#include +#include + +namespace turbomind::gemm { + +struct KernelMetric { + int64_t mio_cost; + int64_t mma_cost; +}; + +class Kernel { +public: + virtual ~Kernel() = default; + + virtual int Launch(const Operation& operation, + float alpha, + const void* A, + const MatrixLayout& Adesc, + const void* U, + const MatrixLayout& Udesc, + const void* B, + const MatrixLayout& Bdesc, + const void* V, + const MatrixLayout& Vdesc, + float beta, + const void* C, + const MatrixLayout& Cdesc, + void* D, + const MatrixLayout& Ddesc, + int swizzle, + int splits, + Workspace& workspace, + cudaStream_t stream) = 0; + + // virtual because different implementation may have different workspace requeirements + virtual int GetMaxSplits(int m, int n, int k, size_t barrier_size, size_t partials_size) = 0; + + // true if this kernel can be used to compute the gemm + bool is_feasible(const GemmDesc& desc) const noexcept; + + std::vector> + Estimate_v2(std::array size, int max_splits, int max_waves, int sm_count) const; + + virtual int GetSwizzle(int m, int n, int k, int splits, int swizzle) = 0; + + const KernelDesc& desc() const noexcept + { + return desc_; + } + + int3 cta_tile_size() const noexcept + { + return desc_.cta_tile; + } + + int3 warp_tile_size() const noexcept + { + return desc_.mma_tile; + } + + int chunk_size_k() const noexcept + { + return chunk_size_k_; + } + + int stages() const noexcept + { + return desc_.stages; + } + + bool split_k() const noexcept + { + return desc_.split_k; + } + + int arch() const noexcept + { + return desc_.arch; + } + + int smem_size() const noexcept + { + return smem_size_; + } + + std::string name() const + { + return name_; + } + +protected: + std::string GetName() const; + + KernelDesc desc_; + + int chunk_size_k_; + int smem_size_; + + std::string name_; +}; + +struct ClusteringParam { + bool cache_policy; + bool max_active_ctas; +}; + +std::vector> Cluster(const std::vector& specs, const ClusteringParam& param); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu new file mode 100644 index 0000000000..3d678df08e --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu @@ -0,0 +1,62 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
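// A worked example, with assumed hardware numbers, of the wave estimate in
// Kernel::Estimate_v2 above: the grid is scheduled in "waves" of
// sm_count * max_active_ctas resident CTAs, plus a fractional final wave.
#include <cstdio>

static int ceil_div(int a, int b)
{
    return (a + b - 1) / b;
}

int main()
{
    const int sm_count        = 108;  // e.g. an A100
    const int max_active_ctas = 2;    // assumed occupancy of the kernel
    const int tiled_m = 32, tiled_n = 16, splits = 1;

    const int   concurrency  = sm_count * max_active_ctas;  // 216 CTAs per full wave
    const int   grid_size    = tiled_m * tiled_n * splits;  // 512 CTAs
    const int   full_waves   = grid_size / concurrency;     // 2
    const int   residue      = grid_size % concurrency;     // 80
    const float partial_wave = (float)ceil_div(residue, sm_count) / max_active_ctas;  // 0.5
    const float waves        = full_waves + partial_wave;   // 2.5

    std::printf("waves = %.2f\n", waves);
    return 0;
}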
+ +#include "src/turbomind/kernels/gemm/arch/config_sm70_s884.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +void Registry::f16_u4g128_f16_tnt_sm70_s884() +{ + using namespace sm70_s884; + { // quant B + using Config = Sm70_s884::Operand, + Transform_Default, + VoidOperand, + typename GetOperand::Operand, + Transform_HMMA_SIMT_B, + typename GetOperand::Operand, + kRowMajor, + half>; + + using namespace cache_policy; + + // m8n32k8: pack_bv=1 + // (8,226.234),(16,192.248),(32,120.564),(64,103.483),(96,98.209),(128,54.537),(192,13.739) + // (256,-6.61),(4096,-16.622),(8192,-16.021) + Add>(); // 50.631 + Add>(); + Add>(); // 50.698 + Add>(); // 93.395 + Add>(); + Add>(); // 93.482 + Add>(); // 82.113 + Add>(); // 80.686 + Add>(); // 92.014 + Add>(); // 110.979 + Add>(); // 147.616 + Add>(); // 186.569 + Add>(); // 218.194 + Add>(); // 209.224 + Add>(); // 219.651 + + // m16n16k8: pack_bv=2 + // (8,179.471),(16,174.246),(32,114.659),(64,100.813),(96,96.822),(128,53.423),(192,12.433),(256,-7.601),(4096,-17.335) + // Add>(); // 50.934 + // Add>(); // 47.874 + // Add>(); // 47.874 + // Add>(); // 95.303 + // Add>(); + // Add>(); // 97.095 + // Add>(); // 86.559 + // Add>(); // 73.869 + // Add>(); // 115.205 + // Add>(); // 96.151 + // Add>(); // 175.285 + // Add>(); + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu new file mode 100644 index 0000000000..e2a0e3c4a5 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu @@ -0,0 +1,44 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/arch/config_sm75_s16816.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +void Registry::f16_u4g128_f16_tnt_sm75_s16816() +{ + using namespace sm75_s16816; + + { // fp x u4 + using C = Sm75_s16816, + Transform_Default, + VoidOperand, + Operand_B_Pack, + Transform_HMMA_16816<1, 0>, + Operand_UV_Pack, + kRowMajor, + half>; + + using S = cache_policy::Stream; + using D = cache_policy::Default; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu new file mode 100644 index 0000000000..a97919d20c --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu @@ -0,0 +1,45 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
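// A hypothetical selection loop over kernels registered in the files above,
// meant only to illustrate how is_feasible() and Estimate_v2() from kernel.h
// fit together; the actual dispatcher is not in this section. `all_kernels`,
// the element type of the size triple, and the equal weighting of the two
// cost terms are assumptions made for the sketch.
#include <cstdint>
#include <limits>
#include <vector>

using namespace turbomind::gemm;

Kernel* select_kernel(const std::vector<Kernel*>& all_kernels, const GemmDesc& desc, int sm_count)
{
    Kernel* best      = nullptr;
    int64_t best_cost = std::numeric_limits<int64_t>::max();
    for (Kernel* kernel : all_kernels) {
        if (!kernel->is_feasible(desc)) {
            continue;  // wrong arch / layouts / types / packing / alignment
        }
        // max_splits = 8 and max_waves = 10 are arbitrary illustration values
        for (const auto& [splits, metric] : kernel->Estimate_v2({desc.m, desc.n, desc.k}, 8, 10, sm_count)) {
            const int64_t cost = metric.mio_cost + metric.mma_cost;  // naive equal weighting
            if (cost < best_cost) {
                best_cost = cost;
                best      = kernel;
            }
        }
    }
    return best;
}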
+ +#include "src/turbomind/kernels/gemm/arch/config_simt.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +void Registry::f16_u4g128_f16_tnt_sm75_simt() +{ + using namespace simt; + + using S = cache_policy::Stream; + using D = cache_policy::Default; + + { // quant B + using Operand_A = typename GetOperand::Operand; + using Operand_B_U4 = typename GetOperand::Operand; + using Operand_V = typename GetOperand::Operand; + + using C = Sm75_Simt; + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu new file mode 100644 index 0000000000..8b188a44d2 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu @@ -0,0 +1,97 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +void Registry::f16_u4g128_f16_tnt_sm80_s16816() +{ + using namespace sm80_s16816; + using namespace cache_policy; + using S = cache_policy::Stream; + using D = cache_policy::Default; + + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_HMMA_16816<1, 0>, // transform B + Operand_UV_Pack, // V + kRowMajor, // order_C + half>; // Tc + + // clang-format off + // Add>(); // 0/0 + Add>(); // 30/3 + Add>(); // --/20 + Add>(); // --/13 + Add>(); // 21/13 + Add>(); // 6/6 + + Add>(); // --/3 + Add>(); // 13/13 + Add>(); // 14/10 + Add>(); // 2/2 + + Add>(); // --/21 + Add>(); // 27/13 + Add>(); // 8/5 + Add>(); // 7/5 + Add>(); // 6/7 + Add>(); + + Add>(); // 1/1 + Add>(); // 1/1 + Add>(); // 4/4 + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on +} + +// sm80_f16_u4g128_f16_ttt_128x256x32_4_s16816_1x8x1_c128x128_a1x32x32_00: 46 +// sm80_f16_u4g128_f16_ttt_128x128x32_3_s16816_1x4x1_c64x128_a1x32x32_00: 27 +// sm80_f16_u4g128_f16_ttt_64x256x32_3_s16816_1x4x1_c64x128_a1x32x32_00: 21 +// sm80_f16_u4g128_f16_ttt_64x256x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 19 +// sm80_f16_u4g128_f16_ttt_16x128x128_4_s16816_1x4x2_c16x128_a1x32x128_01: 17 +// sm80_f16_u4g128_f16_ttt_32x128x128_3_s16816_1x4x2_c32x128_a1x32x128_01: 16 +// sm80_f16_u4g128_f16_ttt_64x128x128_3_s16816_1x4x2_c64x128_a1x32x128_01: 16 +// sm80_f16_u4g128_f16_ttt_96x128x32_4_s16816_1x4x1_c96x128_a1x32x32_01: 16 +// sm80_f16_u4g128_f16_ttt_96x256x32_4_s16816_1x8x1_c96x256_a1x32x32_00: 15 +// sm80_f16_u4g128_f16_ttt_16x64x128_3_s16816_1x2x2_c16x64_a1x32x128_01: 13 +// sm80_f16_u4g128_f16_ttt_16x128x64_4_s16816_1x4x1_c16x128_a1x32x64_01: 13 +// sm80_f16_u4g128_f16_ttt_48x128x128_3_s16816_1x4x2_c48x128_a1x32x128_01: 13 +// sm80_f16_u4g128_f16_ttt_48x256x64_3_s16816_1x4x1_c48x128_a1x32x64_01: 13 +// sm80_f16_u4g128_f16_ttt_16x64x128_4_s16816_1x2x2_c16x64_a1x32x128_01: 11 +// sm80_f16_u4g128_f16_ttt_64x128x64_3_s16816_1x4x1_c64x128_a1x32x64_01: 9 +// sm80_f16_u4g128_f16_ttt_128x128x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 9 +// 
sm80_f16_u4g128_f16_ttt_96x128x128_3_s16816_1x4x2_c96x128_a1x32x128_01: 7 +// sm80_f16_u4g128_f16_ttt_96x256x32_3_s16816_1x8x1_c96x256_a1x32x32_01: 7 +// sm80_f16_u4g128_f16_ttt_48x128x64_4_s16816_1x4x1_c48x128_a1x32x64_01: 6 +// sm80_f16_u4g128_f16_ttt_32x64x128_4_s16816_1x2x2_c32x64_a1x32x128_01: 5 +// sm80_f16_u4g128_f16_ttt_32x256x64_3_s16816_1x4x1_c32x256_a1x32x64_01: 5 +// sm80_f16_u4g128_f16_ttt_64x64x64_6_s16816_1x2x2_c64x64_a1x32x64_01: 5 +// sm80_f16_u4g128_f16_ttt_16x128x128_3_s16816_1x4x2_c16x128_a1x32x128_01: 4 +// sm80_f16_u4g128_f16_ttt_32x128x64_4_s16816_1x4x1_c32x128_a1x32x64_01: 4 +// sm80_f16_u4g128_f16_ttt_48x64x128_4_s16816_1x2x2_c48x64_a1x32x128_01: 4 +// sm80_f16_u4g128_f16_ttt_64x128x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 4 +// sm80_f16_u4g128_f16_ttt_128x128x64_3_s16816_1x4x2_c64x128_a1x32x64_01: 4 +// sm80_f16_u4g128_f16_ttt_128x256x32_3_s16816_1x8x1_c128x128_a1x32x32_00: 4 +// sm80_f16_u4g128_f16_ttt_32x64x128_3_s16816_1x2x2_c32x64_a1x32x128_01: 3 +// sm80_f16_u4g128_f16_ttt_128x256x64_3_s16816_1x8x1_c128x256_a1x32x64_01: 0 + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu new file mode 100644 index 0000000000..908451ed00 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu @@ -0,0 +1,68 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +void Registry::f16_u4g128_f16_tnt_sm90_s16816() +{ + using namespace sm80_s16816; + using namespace cache_policy; + ////////////////////////////////////////////////////////////////////////////// + // ! sm_90 + cp.async + evict policy = warp illegal instruction + ////////////////////////////////////////////////////////////////////////////// + using D = cache_policy::Default; + + using C = Sm80_s16816, // A + Transform_Default, // tarnsform A + VoidOperand, // U + Operand_B_Pack, // B + Transform_HMMA_16816<1, 0>, // transform B + Operand_UV_Pack, // V + kRowMajor, // order_C + half>; // Tc + + // clang-format off + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + // clang-format on +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu b/src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu new file mode 100644 index 0000000000..4e1a071ae7 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel/u4g128_f16_f16_nnn_sm80_s16816.cu @@ -0,0 +1,118 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
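// Key to the kernel names listed in the comments above; the format comes from
// Kernel::GetName() earlier in this patch. Reading one of them,
// sm80_f16_u4g128_f16_ttt_128x256x32_4_s16816_1x8x1_c128x128_a1x32x32_00:
//
//   sm80        target architecture
//   f16         data type of A
//   u4g128      data type of B: 4-bit unsigned, quantization group size 128
//   f16         data type of C
//   ttt         layouts of A / B / C as printed by GetName ('n' = column-major, 't' otherwise)
//   128x256x32  CTA tile, M x N x K
//   4           pipeline stages
//   s16816      MMA instruction class (m16n8k16 tensor-core HMMA)
//   1x8x1       warp-level MMA tiling of the CTA
//   c128x128    epilogue (C) tile
//   a1x32x32    required alignment in M, N, K
//   00          cache/evict policy codes for the A and B operands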
+ +#include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" +#include "src/turbomind/kernels/gemm/cta_map.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/registry.h" +#include "src/turbomind/kernels/gemm/transform.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +using namespace sm80_s16816; +template +using Config_ = Sm80_s16816, // A + Transform_HMMA_16816<0, 1>, // tarnsform A + Operand_UV_Pack, // U + Operand_B, // B + Transform_Default, // transform B + VoidOperand, // V + kColMajor, // order_C + half, // Tc + CtaMapN>; + +void Registry::u4g128_f16_f16_nnn_sm80_s16816() +{ + // ! Must be M-major MMA +#if 0 + using namespace cache_policy; + + using C16 = Config_<16>; + using S = Stream; + using D = Default; + + // clang-format off + Add>(); + // Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + using C8 = Config_<8>; + Add>(); + Add>(); + Add>(); + Add>(); + Add>(); + + // clang-format on + +#endif +} + +// sm80_u4g128_f16_f16_nnn_256x128x32_3_s16816_8x1x1_c128x128_a32x1x32_00: 48 +// sm80_u4g128_f16_f16_nnn_256x48x64_3_s16816_4x1x1_c128x48_a32x1x64_10: 22 +// sm80_u4g128_f16_f16_nnn_128x64x32_5_s16816_4x1x1_c128x64_a32x1x32_10: 17 +// sm80_u4g128_f16_f16_nnn_128x96x64_3_s16816_4x1x1_c128x48_a32x1x64_10: 17 +// sm80_u4g128_f16_f16_nnn_128x128x32_6_s16816_4x1x1_c128x64_a32x1x32_10: 17 +// sm80_u4g128_f16_f16_nnn_128x32x64_4_s16816_4x1x1_c128x32_a32x1x64_10: 16 +// sm80_u4g128_f16_f16_nnn_128x48x128_3_s16816_4x1x2_c128x48_a32x1x128_10: 16 +// sm80_u4g128_f16_f16_nnn_128x128x32_6_s16816_4x1x1_c128x64_a32x1x32_00: 15 +// sm80_u4g128_f16_f16_nnn_256x64x32_3_s16816_4x1x1_c128x64_a32x1x32_00: 14 +// sm80_u4g128_f16_f16_nnn_256x64x32_3_s16816_4x1x1_c128x64_a32x1x32_10: 13 +// sm80_u4g128_f16_f16_nnn_64x8x128_5_s16816_2x1x2_c64x8_a32x1x128_10: 11 +// sm80_u4g128_f16_f16_nnn_128x96x64_3_s16816_4x1x1_c128x48_a32x1x64_00: 11 +// sm80_u4g128_f16_f16_nnn_128x128x32_4_s16816_4x1x1_c128x64_a32x1x32_00: 11 +// sm80_u4g128_f16_f16_nnn_128x16x128_4_s16816_4x1x2_c128x16_a32x1x128_10: 10 +// sm80_u4g128_f16_f16_nnn_64x64x64_3_s16816_2x1x2_c64x64_a32x1x64_10: 10 +// sm80_u4g128_f16_f16_nnn_128x64x32_5_s16816_4x1x1_c128x64_a32x1x32_00: 10 +// sm80_u4g128_f16_f16_nnn_64x16x128_4_s16816_2x1x2_c64x16_a32x1x128_10: 8 +// sm80_u4g128_f16_f16_nnn_64x32x128_3_s16816_2x1x2_c64x32_a32x1x128_10: 8 +// sm80_u4g128_f16_f16_nnn_128x48x64_4_s16816_4x1x1_c128x48_a32x1x64_10: 8 +// sm80_u4g128_f16_f16_nnn_64x32x128_4_s16816_2x1x2_c64x32_a32x1x128_10: 7 +// sm80_u4g128_f16_f16_nnn_128x32x128_5_s16816_4x1x2_c128x32_a32x1x128_10: 7 +// sm80_u4g128_f16_f16_nnn_64x16x128_3_s16816_2x1x2_c64x16_a32x1x128_10: 6 +// sm80_u4g128_f16_f16_nnn_128x8x128_4_s16816_4x1x2_c128x8_a32x1x128_10: 5 +// sm80_u4g128_f16_f16_nnn_128x16x64_4_s16816_4x1x1_c128x16_a32x1x64_10: 5 +// sm80_u4g128_f16_f16_nnn_128x8x32_6_s16816_4x1x1_c128x8_a32x1x32_10: 4 +// sm80_u4g128_f16_f16_nnn_128x8x64_3_s16816_4x1x1_c128x8_a32x1x64_10: 4 +// sm80_u4g128_f16_f16_nnn_128x16x64_3_s16816_4x1x1_c128x16_a32x1x64_10: 4 +// sm80_u4g128_f16_f16_nnn_32x8x128_5_s16816_1x1x4_c32x8_a32x1x128_10: 3 +// sm80_u4g128_f16_f16_nnn_64x48x128_3_s16816_2x1x2_c64x48_a32x1x128_10: 3 +// sm80_u4g128_f16_f16_nnn_256x64x64_3_s16816_4x1x1_c128x64_a32x1x64_10: 2 +// 
sm80_u4g128_f16_f16_nnn_128x128x32_4_s16816_4x1x1_c128x64_a32x1x32_10: 2 +// sm80_u4g128_f16_f16_nnn_64x64x64_3_s16816_2x1x2_c64x64_a32x1x64_00: 1 +// sm80_u4g128_f16_f16_nnn_256x64x64_3_s16816_4x1x1_c128x64_a32x1x64_00: 1 +// sm80_u4g128_f16_f16_nnn_256x128x64_3_s16816_8x1x1_c128x128_a32x1x64_00: 0 + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/kernel_impl.h b/src/turbomind/kernels/gemm/kernel_impl.h new file mode 100644 index 0000000000..171b0b6952 --- /dev/null +++ b/src/turbomind/kernels/gemm/kernel_impl.h @@ -0,0 +1,320 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/gemm/gemm_universal.h" +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/thread_group_map.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +template +class KernelImpl: public Kernel { +public: + // import frequently used constants + static constexpr int CTA_M = Gemm::CTA_M; + static constexpr int CTA_N = Gemm::CTA_N; + static constexpr int CTA_K = Gemm::CTA_K; + + using Impl = typename Gemm::Impl; + + using OpA = typename Gemm::OperandA; + using OpB = typename Gemm::OperandB; + using OpU = typename Gemm::OperandU; + using OpV = typename Gemm::OperandV; + + KernelImpl() + { + desc_.order_a = OpA::kOrder; + desc_.order_b = transpose(OpB::kOrder); + desc_.order_c = Gemm::kOrderC; + + desc_.type_a = get_data_type_v; + desc_.type_b = get_data_type_v; + desc_.type_c = get_data_type_v; + + desc_.pack_a = OpA::kPack; + desc_.pack_b = OpB::kPack; + desc_.pack_u = OpU::kPack; + desc_.pack_v = OpV::kPack; + + desc_.quant_a = QuantDesc{}; + desc_.quant_b = QuantDesc{}; + + if constexpr (OpU::SmemLayout::kSize > 1) { + desc_.quant_a = QuantDesc{QuantType::kDefault, OpU::kGroupSize}; + } + + if constexpr (OpV::SmemLayout::kSize > 1) { + desc_.quant_b = QuantDesc{QuantType::kDefault, OpV::kGroupSize}; + } + + desc_.cta_tile = {Gemm::CTA_M, Gemm::CTA_N, Gemm::CTA_K}; + desc_.mma_tile = {Impl::MMA_Map::kGroupM, Impl::MMA_Map::kGroupN, Impl::MMA_Map::kGroupK}; + chunk_size_k_ = Gemm::kChunkSizeK; + + using IterA = typename OpA::GmemIter; + using IterB = typename OpB::GmemIter; + + desc_.align.x = OpA::kOrder == kColMajor ? IterA::ThreadMap::kAccessC : 1; + desc_.align.y = OpB::kOrder == kColMajor ? 
IterB::ThreadMap::kAccessC : 1; + desc_.align.z = Gemm::CTA_K; + + desc_.policy_a = (int)IterA::Policy::kEvictPolicy; + desc_.policy_b = (int)IterB::Policy::kEvictPolicy; + desc_.c_tile = {Gemm::Epilogue::TM, Gemm::Epilogue::TN}; + desc_.op_class = Impl::kOpClass; + + smem_size_ = sizeof(typename Gemm::SharedStorage); + + desc_.stages = Impl::Stages; + desc_.split_k = Gemm::SplitK; + + desc_.arch = Gemm::Arch::value; + + using Params = typename Gemm::Param; + using CtaMap = typename Gemm::CtaMap; + + auto func = gemm_kernel; + + if (smem_size_ > (48 << 10)) { + cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_); + } + + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &desc_.max_active_ctas, gemm_kernel, Impl::WARPS * WARP_SIZE, smem_size_); + + cudaFuncGetAttributes(&desc_.attr, func); + + name_ = GetName(); + } + + int Launch(const Operation& operation, + float alpha, + const void* A, + const MatrixLayout& Adesc, + const void* U, + const MatrixLayout& Udesc, + const void* B, + const MatrixLayout& _Bdesc, + const void* V, + const MatrixLayout& _Vdesc, + float beta, + const void* C, + const MatrixLayout& Cdesc, + void* D, + const MatrixLayout& Ddesc, + int swizzle, + int splits, + Workspace& workspace, + cudaStream_t stream) override + { + using Map = typename Gemm::CtaMap; + + const int m = Ddesc.rows; + const int n = Ddesc.cols; + const int k = Adesc.cols; + + auto transpose = [](MatrixLayout x) { + std::swap(x.rows, x.cols); + x.order = gemm::transpose(x.order); + return x; + }; + + const MatrixLayout Bdesc = transpose(_Bdesc); + const MatrixLayout Vdesc = transpose(_Vdesc); + + const int chunk_cnt = ceil_div(k, Gemm::kChunkSizeK); + + // Limit splits by num of chunks to avoid chaos + splits = std::min(chunk_cnt, splits); + + auto tiles = Map::get_tiled_shape(m, n, k, CTA_M, CTA_N, splits); + + if (splits > 1) { + size_t bsize{}, psize{}; + GetWorkspaceSizes(m, n, tiles.x, tiles.y, splits, bsize, psize); + const int max_splits = GetMaxSplits(m, n, k, workspace.barriers_size, workspace.partials_size); + if (workspace.barriers_size < bsize || workspace.partials_size < psize) { + fprintf( + stderr, + "Problem size (%d, %d, %d), workspace size too small (%d, %d) vs required (%d, %d) for %d splits. 
Force `splits` = %d\n", + m, + n, + k, + (int)workspace.barriers_size, + (int)workspace.partials_size, + (int)bsize, + (int)psize, + splits, + max_splits); + splits = max_splits; + tiles = Map::get_tiled_shape(m, n, k, CTA_M, CTA_N, splits); + } + } + + swizzle = Map::get_log_tile(tiles, 1 << swizzle); + + const auto grid = Map::get_grid_shape(tiles, swizzle); + const auto block = Gemm::Impl::WARPS * WARP_SIZE; + + using Ta = typename Gemm::Ta; + using Tb = typename Gemm::Tb; + using Tu = typename Gemm::Tu; + using Tv = typename Gemm::Tv; + using Tc = typename Gemm::Tc; + + if constexpr (0) { + [[maybe_unused]] static const int _ = [] { + std::cout << "A:\n"; + Print(typename Gemm::OperandA::GmemIter::ThreadMap{}); + std::cout << "\nB:\n"; + Print(typename Gemm::OperandB::GmemIter::ThreadMap{}); + if constexpr (!std::is_same_v) { + std::cout << "\nU:\n"; + Print(typename Gemm::OperandU::GmemIter::ThreadMap{}); + } + if constexpr (!std::is_same_v) { + std::cout << "\nV:\n"; + Print(typename Gemm::OperandV::GmemIter::ThreadMap{}); + } + printf("warp count: %d\n", Impl::WARPS); + Print_(typename Gemm::Impl::MMA_Map{}); + + printf("C:\n"); + Print(typename Gemm::Epilogue::Map{}); + + std::cout << "Smem for mainloop: " << sizeof(Gemm::SharedStorage::mainloop) << "\n"; + std::cout << "Smem for epilogue: " << sizeof(Gemm::SharedStorage::epilogue) << "\n"; + + return 0; + }(); + } + + int lda = Adesc.ld; + int ldb = Bdesc.ld; + + if (Gemm::kPackA) { + lda = mk2cs(Packing_v2::apply({m, k})).x; + } + if (Gemm::kPackB) { + ldb = mk2cs(Packing_v2::apply({n, k})).x; + } + + // std::cout << "lda=" << lda << ", ldb=" << ldb << ", ldc=" << Cdesc.ld << "\n"; + + // std::cout << "C: " << C << ", D: " << D << "\n"; + + const bool silu_act = ((int)operation.epilogue & (int)Epilogue::kGatedSilu); + + const int partial_C_ld = mk2cs(Ddesc.rows, Ddesc.cols).x; + + EpilogueParam epilogue{m, + n, + (Tc*)D, + Ddesc.ld, + (float*)workspace.partials, + partial_C_ld, + (int*)workspace.barriers, + {alpha, beta, (const Tc*)C, Cdesc.ld}, + silu_act}; + + const int chunk_per_split = chunk_cnt / splits; + const int chunk_remianing = chunk_cnt % splits; + const int chunk_offset = splits - chunk_remianing; + // chunk_id = z * chunk_per_split + max(z - (splits - chunk_remaining), 0); + // offset_k = chunk_id * kChunkSizeK; + // gemm_k_size = offset_k + (chunk_per_split + int(z > chunk_offset)) * kChunkSizeK + // gemm_k_size = std::min(gemm_k_size, k) - offset_k + + // std::cout << k << " " << Gemm::kChunkSizeK << " " << splits << " " << chunk_per_split << " " << + // chunk_remianing << " " << chunk_offset << "\n"; + + typename Gemm::Param param{m, + n, + k, + typename Gemm::PtrA{(Ta*)A}, + lda, + (Tu*)U, + Udesc.ld, + typename Gemm::PtrB{(Tb*)B}, + ldb, + (Tv*)V, + Vdesc.ld, + swizzle, + tiles, + chunk_per_split, + chunk_offset, + epilogue}; + + gemm_kernel<<>>(param, Map{}); + + return 0; + } + + template + static auto _cast(T* p) + { + if constexpr (bitsof % 8 == 0) { + return p; + } + else { + return (char*)p; + } + } + + // ! 
This assumes N results in 16 byte aligned partials + void + GetWorkspaceSizes(int m, int n, int tiled_m, int tiled_n, int splits, size_t& barriers_size, size_t& partials_size) + { + static constexpr bool kSerial = true; + + partials_size = sizeof(float) * m * n; + barriers_size = sizeof(int) * tiled_m * tiled_n; + + if constexpr (!kSerial) { + partials_size *= splits; + barriers_size *= splits; + } + } + + int GetMaxSplits(int m, int n, int k, size_t barrier_size, size_t partials_size) override + { + if (!Gemm::SplitK) { // kernel has no split-k support + return 1; + } + + const int tiled_m = ceil_div(m, CTA_M); + const int tiled_n = ceil_div(n, CTA_N); + + size_t bsize_1split{}; + size_t psize_1split{}; + + // workspace for 1 non-trival split + GetWorkspaceSizes(m, n, tiled_m, tiled_n, 1, bsize_1split, psize_1split); + + if (barrier_size >= bsize_1split && partials_size >= psize_1split) { + // Serial split-k requires workspace for 1 split only + // But it can't exceed num of k chunks + const int chunk_cnt = ceil_div(k, Gemm::kChunkSizeK); + return std::min(chunk_cnt, 32); + } + else { + return 1; + } + } + + int GetSwizzle(int m, int n, int k, int splits, int swizzle) override + { + using Map = typename Gemm::CtaMap; + const auto tiles = Map::get_tiled_shape(m, n, k, CTA_M, CTA_N, splits); + return Map::get_log_tile(tiles, 1 << swizzle); + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/mainloop_sm70.h b/src/turbomind/kernels/gemm/mainloop_sm70.h new file mode 100644 index 0000000000..17df36b4e7 --- /dev/null +++ b/src/turbomind/kernels/gemm/mainloop_sm70.h @@ -0,0 +1,354 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/thread_map.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include + +namespace turbomind::gemm { + +template +struct GroupIter { + + static_assert((Stages & (Stages - 1)) == 0); + + int iter_ = 0; + + __device__ void Advance() + { + iter_ = (iter_ + 1) % Stages; + } + + __device__ constexpr explicit operator bool() + { + return iter_ == 0; + } +}; + +template<> +struct GroupIter<1> { + __device__ void Advance() {} + __device__ constexpr explicit operator bool() + { + return true; + } +}; + +template +struct SmemIter { + Pointer pointer; + Pointer other_; + + __device__ SmemIter(Pointer base): pointer{base}, other_{base + Step} {} + + __device__ void Advance() + { + auto tmp = pointer; + pointer = other_; + other_ = tmp; + } +}; + +template +struct Binding { + A& a; + B& b; + U& u; + V& v; + __device__ Binding(A& a, B& b, U& u, V& v): a{a}, b{b}, u{u}, v{v} {} // CTAD +}; + +// Inspired by +// https://github.com/NVIDIA/cutlass/blob/f93a69134ec8259fd235f220209d6f8734a5cb06/include/cutlass/gemm/threadblock/mma_pipelined.h +template +struct MainloopSm70 { + + using MMA_Atom = typename MMA::Atom; + using MMA_Map = typename MMA::Map; + + using FragC = typename MMA_Atom::FragC[MMA::kMmaIterM][MMA::kMmaIterN]; + + static constexpr int Stages = Stages_; + + static constexpr int CTA_M = MMA::M; + static constexpr int CTA_N = MMA::N; + static constexpr int CTA_K = MMA::K; + + static constexpr auto kOpClass = 
MMA_Atom::kOpClass; + + static constexpr int WARPS = MMA::kThreadCount / WARP_SIZE; + + using OperandA = MakeOperand; + using OperandU = MakeOperand; + + using OperandB = MakeOperand; + using OperandV = MakeOperand; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + using Ta = typename OperandA::Dtype; + using Tb = typename OperandB::Dtype; + using Tu = typename OperandU::Dtype; + using Tv = typename OperandV::Dtype; + + using SmemLayoutA = typename OperandA::SmemLayout; + using SmemLayoutB = typename OperandB::SmemLayout; + using SmemLayoutU = typename OperandU::SmemLayout; + using SmemLayoutV = typename OperandV::SmemLayout; + + using SmemCopyA = SmemCopy; + using SmemCopyU = SmemCopy; + using SmemCopyB = SmemCopy; + using SmemCopyV = SmemCopy; + + using SmemAccessorA = SmemAccessor; + using SmemAccessorB = SmemAccessor; + using SmemAccessorU = SmemAccessor; + using SmemAccessorV = SmemAccessor; + + using GmemIterA = typename OperandA::GmemIter; + using GmemIterB = typename OperandB::GmemIter; + using GmemIterU = typename OperandU::GmemIter; + using GmemIterV = typename OperandV::GmemIter; + + struct SharedStorage { + __align__(16) Array A; + __align__(16) Array B; + __align__(16) Array U; + __align__(16) Array V; + }; + + template + __device__ void _advance_smem(GmemIter& gmem_iter, SmemIter& smem_iter) + { + gmem_iter.smem_data_ = smem_iter.pointer; + smem_iter.Advance(); + } + + // zip with + template + __device__ void AdvanceSmemStage(BindingG& g, BindingS& s) + { + _advance_smem(g.a, s.a); + _advance_smem(g.b, s.b); + _advance_smem(g.u, s.u); + _advance_smem(g.v, s.v); + } + + template + __device__ void ClearSmem(Binding& g) + { + g.a.ClearSmem(); + g.b.ClearSmem(); + g.u.ClearSmem(); + g.v.ClearSmem(); + } + + template + __device__ void Fetch(Binding& g, Fragments& f, bool mask) + { + g.a.Fetch(f.a, mask); + g.b.Fetch(f.b, mask); + g.u.Fetch(f.u, mask); + g.v.Fetch(f.v, mask); + } + + template + __device__ void Store(Binding& g, Fragments& f) + { + g.a.Store(f.a); + g.b.Store(f.b); + g.u.Store(f.u); + g.v.Store(f.v); + } + + template + __device__ void AdvanceGmemStage(Binding& g) + { + g.a.Advance(); + g.b.Advance(); + g.u.Advance(); + g.v.Advance(); + } + + __device__ void operator()(GmemIterA& gmem_A, + GmemIterB& gmem_B, + GmemIterU& gmem_U, + GmemIterV& gmem_V, + FragC& frag_C, + int tile_iter, + SharedStorage& storage) + { + static_assert(MMA::kAtomK == 1); + + static constexpr int UU = 1; // ceil_div(GroupSizeU_, MMA_Map::TileK); + static constexpr int VV = 1; // ceil_div(GroupSizeV_, MMA_Map::TileK); + + // mma_iter_x = tile_iter_x * atom_x + typename MMA_Atom::FragA frag_A[MMA::kTileIterK][MMA::kMmaIterM]; + typename MMA_Atom::FragB frag_B[MMA::kTileIterK][MMA::kMmaIterN]; + + typename SmemCopyA::Frag data_A[MMA::kTileIterK]; + typename SmemCopyB::Frag data_B[MMA::kTileIterK]; + typename SmemCopyU::Frag data_U[ceil_div(MMA::kTileIterK, UU)]; + typename SmemCopyV::Frag data_V[ceil_div(MMA::kTileIterK, VV)]; + + SmemIter, SmemLayoutA::kSize, Stages> smem_A{storage.A.data()}; + SmemIter, SmemLayoutB::kSize, Stages> smem_B{storage.B.data()}; + SmemIter, SmemLayoutU::kSize, Stages> smem_U{storage.U.data()}; + SmemIter, SmemLayoutV::kSize, Stages> smem_V{storage.V.data()}; + + typename GmemIterA::Fragments rmem_A; + typename GmemIterB::Fragments rmem_B; + typename GmemIterU::Fragments rmem_U; + typename GmemIterV::Fragments rmem_V; + + GroupIter gmem_group_iter_U{}; + GroupIter gmem_group_iter_V{}; + + auto smem_group_iter_U = gmem_group_iter_U; + auto 
smem_group_iter_V = gmem_group_iter_V; + + // a separate counter tends to generate better code + int gmem_iter = tile_iter; + int gmem_mask = true; + + Binding gmem_iters{gmem_A, gmem_B, gmem_U, gmem_V}; + Binding smem_iters{smem_A, smem_B, smem_U, smem_V}; + Binding rmem{rmem_A, rmem_B, rmem_U, rmem_V}; + + // r0,w_ + + PRAGMA_UNROLL + for (int i = 0; i < Stages; ++i) { + AdvanceSmemStage(gmem_iters, smem_iters); + ClearSmem(gmem_iters); + } + + // r0,w1 + + __syncthreads(); + + auto fetch_stage = [&](auto& rmem) { + Fetch(gmem_iters, rmem, gmem_mask); + AdvanceGmemStage(gmem_iters); + gmem_group_iter_U.Advance(); + gmem_group_iter_V.Advance(); + gmem_U.g_mask = (bool)gmem_group_iter_U; + gmem_V.g_mask = (bool)gmem_group_iter_V; + if (--gmem_iter == 0) { + gmem_mask = false; + } + }; + + auto advance_and_wait_smem_stage = [&] { + __syncthreads(); + AdvanceSmemStage(gmem_iters, smem_iters); + }; + + const int3 offset_mnk = MMA::get_offset(threadIdx.x); + const int offset_m = offset_mnk.x; + const int offset_n = offset_mnk.y; + const int offset_k = offset_mnk.z; + + SmemCopyA smem_copy_A{{offset_m, offset_k}}; + SmemCopyU smem_copy_U{{offset_m, offset_k}}; + SmemCopyB smem_copy_B{{offset_n, offset_k}}; + SmemCopyV smem_copy_V{{offset_n, offset_k}}; + + auto preload = [&](int k) { + smem_copy_A(smem_A.pointer, data_A[k], k); + smem_copy_U(smem_U.pointer, data_U[k / UU], k, k % UU == 0 && (bool)smem_group_iter_U); + + smem_copy_B(smem_B.pointer, data_B[k], k); + smem_copy_V(smem_V.pointer, data_V[k / VV], k, k % VV == 0 && (bool)smem_group_iter_V); + }; + + AdvanceSmemStage(gmem_iters, smem_iters); + // r1,w0 + + fetch_stage(rmem); // gmem -> rmem + + Store(gmem_iters, rmem); // rmem -> smem + + advance_and_wait_smem_stage(); + // r0,w1 + + preload(0); // smem -> data_[A,B,U,V] + + TransformA::apply(frag_A, 0, data_A, data_U, UU); + TransformB::apply(frag_B, 0, data_B, data_V, VV); + + PRAGMA_NO_UNROLL + for (; tile_iter > 0; --tile_iter) { + constexpr int ITER_K = MMA::kTileIterK; + static_assert(ITER_K > 1); + + PRAGMA_UNROLL + for (int k = 0; k < ITER_K; ++k) { + // The last iter, store prefetched fragments to smem + if (k == ITER_K - 1) { + Store(gmem_iters, rmem); + advance_and_wait_smem_stage(); // swap rw + smem_group_iter_U.Advance(); + smem_group_iter_V.Advance(); + } + + // Preload for next iter, smem -> data_[A,B,U,V] + preload((k + 1) % ITER_K); + + // The first iter, issue the prefetching of next stage + if (k == 0) { + fetch_stage(rmem); + } + + // PRAGMA_UNROLL + // for (int n = 0; n < MMA::kMmaIterN; ++n) { + // PRAGMA_UNROLL + // for (int m = 0; m < MMA::kMmaIterM; ++m) { + // int mm = n % 2 ? MMA::kMmaIterM - m - 1 : m; + // MMA_Atom::fma(frag_C[mm][n], frag_A[k][mm], frag_B[k][n], frag_C[mm][n]); + // } + // } + + PRAGMA_UNROLL + for (int m = 0; m < MMA::kMmaIterM; ++m) { + PRAGMA_UNROLL + for (int n = 0; n < MMA::kMmaIterN; ++n) { + int nn = m % 2 ? MMA::kMmaIterN - n - 1 : n; + MMA_Atom::fma(frag_C[m][nn], frag_A[k][m], frag_B[k][nn], frag_C[m][nn]); + } + } + + TransformA::apply(frag_A, (k + 1) % ITER_K, data_A, data_U, UU); + TransformB::apply(frag_B, (k + 1) % ITER_K, data_B, data_V, VV); + } + } + + __syncthreads(); + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/mainloop_sm80_v2.h b/src/turbomind/kernels/gemm/mainloop_sm80_v2.h new file mode 100644 index 0000000000..f9783f1304 --- /dev/null +++ b/src/turbomind/kernels/gemm/mainloop_sm80_v2.h @@ -0,0 +1,390 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
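// A small host-side illustration of the serpentine traversal used in the FMA
// loops above: even rows sweep the MMA columns left to right, odd rows sweep
// right to left, so two consecutive MMAs always share one operand fragment.
#include <cstdio>

int main()
{
    const int ITER_M = 2, ITER_N = 4;
    for (int m = 0; m < ITER_M; ++m) {
        for (int n = 0; n < ITER_N; ++n) {
            const int nn = m % 2 ? ITER_N - n - 1 : n;
            std::printf("(%d,%d) ", m, nn);
        }
    }
    std::printf("\n");  // (0,0) (0,1) (0,2) (0,3) (1,3) (1,2) (1,1) (1,0)
    return 0;
}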
+ +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/thread_map.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include + +namespace turbomind::gemm { + +template +struct GroupIter { + + static_assert((Stages & (Stages - 1)) == 0); + + int iter_ = 0; + + __device__ void Advance() + { + iter_ = (iter_ + 1) % Stages; + } + + __device__ constexpr explicit operator bool() + { + return iter_ == 0; + } +}; + +template<> +struct GroupIter<1> { + __device__ void Advance() {} + __device__ constexpr explicit operator bool() + { + return true; + } +}; + +template +struct SmemIter { + Pointer base_; + Pointer pointer; + int pipe_iter_; + + __device__ SmemIter(Pointer base): base_{base}, pointer{base}, pipe_iter_{} {} + + __device__ void Advance() + { + pipe_iter_ += 1; + pointer = pointer + Step; + if (pipe_iter_ == Stages) { + pipe_iter_ = 0; + pointer = base_; + } + } +}; + +template +struct Binding { + A& a; + B& b; + U& u; + V& v; + __device__ Binding(A& a, B& b, U& u, V& v): a{a}, b{b}, u{u}, v{v} {} // CTAD +}; + +// Inspired by +// https://github.com/NVIDIA/cutlass/blob/f93a69134ec8259fd235f220209d6f8734a5cb06/include/cutlass/gemm/threadblock/mma_multistage.h +// https://github.com/NVIDIA/cutlass/blob/f93a69134ec8259fd235f220209d6f8734a5cb06/include/cutlass/gemm/collective/sm80_mma_multistage.hpp +template +struct MainloopSm80_v2 { + + using MMA_Atom = typename MMA::Atom; + using MMA_Map = typename MMA::Map; + + using FragC = typename MMA_Atom::FragC[MMA::kMmaIterM][MMA::kMmaIterN]; + + static constexpr int Stages = Stages_; + + static constexpr int CTA_M = MMA::M; + static constexpr int CTA_N = MMA::N; + static constexpr int CTA_K = MMA::K; + + static constexpr auto kOpClass = MMA_Atom::kOpClass; + + static constexpr int WARPS = MMA::kThreadCount / WARP_SIZE; + + using OperandA = MakeOperand; + using OperandU = MakeOperand; + + using OperandB = MakeOperand; + using OperandV = MakeOperand; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + using Ta = typename OperandA::Dtype; + using Tb = typename OperandB::Dtype; + using Tu = typename OperandU::Dtype; + using Tv = typename OperandV::Dtype; + + using SmemLayoutA = typename OperandA::SmemLayout; + using SmemLayoutB = typename OperandB::SmemLayout; + using SmemLayoutU = typename OperandU::SmemLayout; + using SmemLayoutV = typename OperandV::SmemLayout; + + using SmemCopyA = SmemCopy; + using SmemCopyU = SmemCopy; + using SmemCopyB = SmemCopy; + using SmemCopyV = SmemCopy; + + using SmemAccessorA = SmemAccessor; + using SmemAccessorB = SmemAccessor; + using SmemAccessorU = SmemAccessor; + using SmemAccessorV = SmemAccessor; + + using GmemIterA = typename OperandA::GmemIter; + using GmemIterB = typename OperandB::GmemIter; + using GmemIterU = typename OperandU::GmemIter; + using GmemIterV = typename OperandV::GmemIter; + + static constexpr int kFusePrefetch = FusePrefetch_; + + static constexpr int kMaxPrefetchIter = 1; + // std::min(ceil_div(std::max(GmemIterA::ITER_S, GmemIterB::ITER_S), 4), MMA::kTileIterK); + + static constexpr int kBatchA = ceil_div(GmemIterA::ITER_S, kMaxPrefetchIter); + static constexpr int kBatchB = 
ceil_div(GmemIterB::ITER_S, kMaxPrefetchIter); + static constexpr int kBatchU = ceil_div(GmemIterU::ITER_S, kMaxPrefetchIter); + static constexpr int kBatchV = ceil_div(GmemIterV::ITER_S, kMaxPrefetchIter); + + struct SharedStorage { + __align__(16) Array A; + __align__(16) Array B; + __align__(16) Array U; + __align__(16) Array V; + }; + + __device__ void Wait() + { + __pipeline_wait_prior(Stages - 2); + __syncthreads(); + } + + template + __device__ void _advance_smem(GmemIter& gmem_iter, SmemIter& smem_iter) + { + gmem_iter.smem_data_ = smem_iter.pointer; + smem_iter.Advance(); + } + + // zip with + template + __device__ void AdvanceSmemStage(BindingG& g, BindingS& s) + { + _advance_smem(g.a, s.a); + _advance_smem(g.b, s.b); + _advance_smem(g.u, s.u); + _advance_smem(g.v, s.v); + } + + template + __device__ void ClearSmem(Binding& g) + { + g.a.ClearSmem(); + g.b.ClearSmem(); + g.u.ClearSmem(); + g.v.ClearSmem(); + } + + template + __device__ void Prefetch(Binding& g, bool mask) + { + g.a.Prefetch(mask); + g.b.Prefetch(mask); + g.u.Prefetch(mask); + g.v.Prefetch(mask); + } + + template + __device__ void Prefetch(Binding& g, int k, bool mask) + { + int batch_A = min((k + 1) * kBatchA, GmemIterA::ITER_S) - k * kBatchA; + int batch_B = min((k + 1) * kBatchB, GmemIterB::ITER_S) - k * kBatchB; + int batch_U = min((k + 1) * kBatchU, GmemIterU::ITER_S) - k * kBatchU; + int batch_V = min((k + 1) * kBatchV, GmemIterV::ITER_S) - k * kBatchV; + g.a.Prefetch(k * kBatchA, batch_A, mask); + g.b.Prefetch(k * kBatchB, batch_B, mask); + g.u.Prefetch(k * kBatchU, batch_U, mask); + g.v.Prefetch(k * kBatchV, batch_V, mask); + } + + template + __device__ void AdvanceGmemStage(Binding& g) + { + g.a.Advance(); + g.b.Advance(); + g.u.Advance(); + g.v.Advance(); + } + + __device__ void operator()(GmemIterA& gmem_A, + GmemIterB& gmem_B, + GmemIterU& gmem_U, + GmemIterV& gmem_V, + FragC& frag_C, + int tile_iter, + SharedStorage& storage) + { + static_assert(MMA::kAtomK == 1); + + static constexpr int UU = 1; // ceil_div(GroupSizeU_, MMA_Map::TileK); + static constexpr int VV = 1; // ceil_div(GroupSizeV_, MMA_Map::TileK); + + // mma_iter_x = tile_iter_x * atom_x + typename MMA_Atom::FragA frag_A[MMA::kTileIterK][MMA::kMmaIterM]; + typename MMA_Atom::FragB frag_B[MMA::kTileIterK][MMA::kMmaIterN]; + + typename SmemCopyA::Frag data_A[MMA::kTileIterK]; + typename SmemCopyB::Frag data_B[MMA::kTileIterK]; + typename SmemCopyU::Frag data_U[ceil_div(MMA::kTileIterK, UU)]; + typename SmemCopyV::Frag data_V[ceil_div(MMA::kTileIterK, VV)]; + + SmemIter, SmemLayoutA::kSize, Stages> smem_A{storage.A.data()}; + SmemIter, SmemLayoutB::kSize, Stages> smem_B{storage.B.data()}; + SmemIter, SmemLayoutU::kSize, Stages> smem_U{storage.U.data()}; + SmemIter, SmemLayoutV::kSize, Stages> smem_V{storage.V.data()}; + + GroupIter gmem_group_iter_U{}; + GroupIter gmem_group_iter_V{}; + + auto smem_group_iter_U = gmem_group_iter_U; + auto smem_group_iter_V = gmem_group_iter_V; + + // a separate counter tends to generate better code + int gmem_iter = tile_iter; + int gmem_mask = true; + + Binding gmem_iters{gmem_A, gmem_B, gmem_U, gmem_V}; + Binding smem_iters{smem_A, smem_B, smem_U, smem_V}; + + PRAGMA_UNROLL + for (int i = 0; i < Stages; ++i) { + AdvanceSmemStage(gmem_iters, smem_iters); + ClearSmem(gmem_iters); + } + + // r: 0, w:s-1 + + __syncthreads(); + + auto prefetch_stage = [&] { + Prefetch(gmem_iters, gmem_mask); + __pipeline_commit(); + AdvanceGmemStage(gmem_iters); + gmem_group_iter_U.Advance(); + gmem_group_iter_V.Advance(); + 
gmem_U.g_mask = (bool)gmem_group_iter_U; + gmem_V.g_mask = (bool)gmem_group_iter_V; + if (--gmem_iter == 0) { + gmem_mask = false; + } + }; + + [[maybe_unused]] auto prefetch_batch = [&](int k) { + Prefetch(gmem_iters, k, gmem_mask); + if (k == MMA::kTileIterK - 1) { + __pipeline_commit(); + AdvanceGmemStage(gmem_iters); + gmem_group_iter_U.Advance(); + gmem_group_iter_V.Advance(); + gmem_U.g_mask = (bool)gmem_group_iter_U; + gmem_V.g_mask = (bool)gmem_group_iter_V; + if (--gmem_iter == 0) { + gmem_mask = false; + } + } + }; + + auto advance_and_wait_smem_stage = [&] { + Wait(); + AdvanceSmemStage(gmem_iters, smem_iters); + }; + + const int3 offset_mnk = MMA::get_offset(threadIdx.x); + const int offset_m = offset_mnk.x; + const int offset_n = offset_mnk.y; + const int offset_k = offset_mnk.z; + + SmemCopyA smem_copy_A{{offset_m, offset_k}}; + SmemCopyU smem_copy_U{{offset_m, offset_k}}; + SmemCopyB smem_copy_B{{offset_n, offset_k}}; + SmemCopyV smem_copy_V{{offset_n, offset_k}}; + + auto preload = [&](int k) { + smem_copy_A(smem_A.pointer, data_A[k], k); + smem_copy_U(smem_U.pointer, data_U[k / UU], k, k % UU == 0 && (bool)smem_group_iter_U); + + smem_copy_B(smem_B.pointer, data_B[k], k); + smem_copy_V(smem_V.pointer, data_V[k / VV], k, k % VV == 0 && (bool)smem_group_iter_V); + }; + + PRAGMA_UNROLL + for (int stage = 0; stage < Stages - 1; ++stage) { + AdvanceSmemStage(gmem_iters, smem_iters); + prefetch_stage(); + } + // r:-1, w:-2 + + advance_and_wait_smem_stage(); + // r: 0, w:-1 + + preload(0); + + TransformA::apply(frag_A, 0, data_A, data_U, UU); + TransformB::apply(frag_B, 0, data_B, data_V, VV); + + if constexpr (kFusePrefetch) { + prefetch_batch(0); + } + + PRAGMA_NO_UNROLL + for (; tile_iter > 0; --tile_iter) { + if constexpr (!kFusePrefetch) { + prefetch_stage(); + } + constexpr int ITER_K = MMA::kTileIterK; + static_assert(ITER_K > 1); + + PRAGMA_UNROLL + for (int k = 0; k < ITER_K; ++k) { + // preload for next iter + preload((k + 1) % ITER_K); + PRAGMA_UNROLL + for (int n = 0; n < MMA::kMmaIterN; ++n) { + PRAGMA_UNROLL + for (int m = 0; m < MMA::kMmaIterM; ++m) { + int mm = n % 2 ? (MMA::kMmaIterM - m - 1) : m; + MMA_Atom::fma(frag_C[mm][n], frag_A[k][mm], frag_B[k][n], frag_C[mm][n]); + } + } + // PRAGMA_UNROLL + // for (int m = 0; m < MMA::kMmaIterM; ++m) { + // PRAGMA_UNROLL + // for (int n = 0; n < MMA::kMmaIterN; ++n) { + // int nn = n; + // int mm = m; + // MMA_Atom::fma(frag_C[mm][nn], frag_A[k][mm], frag_B[k][nn], frag_C[mm][nn]); + // } + // } + if constexpr (kFusePrefetch) { + prefetch_batch((k + 1) % ITER_K); + } + if (k + 1 == ITER_K - 1) { + advance_and_wait_smem_stage(); + smem_group_iter_U.Advance(); + smem_group_iter_V.Advance(); + } + TransformA::apply(frag_A, (k + 1) % ITER_K, data_A, data_U, UU); + TransformB::apply(frag_B, (k + 1) % ITER_K, data_B, data_V, VV); + } + } + + __pipeline_commit(); + __pipeline_wait_prior(0); + + __syncthreads(); + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/operand.h b/src/turbomind/kernels/gemm/operand.h new file mode 100644 index 0000000000..41fc2b48a9 --- /dev/null +++ b/src/turbomind/kernels/gemm/operand.h @@ -0,0 +1,66 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
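+// NOTE: summary inferred from the definitions below. An "operand" bundles
+// everything the mainloop needs to know about one matrix input at compile
+// time: element type, packing and storage order, the shared-memory layout,
+// the gmem -> smem iterator and the smem -> register copy atom. MakeOperand
+// instantiates that bundle for a concrete CTA shape and warp count, while
+// VoidOperand is the no-op stand-in used when an input (e.g. the U/V quant
+// parameters) is absent. GetOperand is the customization point that concrete
+// operand families (HMMA_16816, HMMA_884, SIMT, ...) are expected to
+// specialize in their own translation units.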
+ +#pragma once + +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/iterator.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +struct VoidOperand { + using Dtype = int; + + static constexpr Pack kPack = 0; + static constexpr Order kOrder = Order::kColMajor; + + struct GetSmemLayout { + static constexpr SmemLayoutV2<1, 1> apply(...) + { + return {}; + } + }; + + using SmemCopyAtom = VoidSmemCopyAtom; + + struct GetGmemIter { + static constexpr auto apply(...) + { + return type_c; + } + }; +}; + +/// TODO: fix AlignC, AlignS +/// TODO: fix GroupSize +template +struct MakeOperand { + + using Dtype = typename Operand::Dtype; + + static constexpr Pack kPack = Operand::kPack; + static constexpr Order kOrder = Operand::kOrder; + static constexpr int kGroupSize = GroupSize; + + static constexpr int2 kPackMK = Packing_v2::apply({M, ceil_div(K, kGroupSize)}); + + static constexpr pair kShapeMK{}; + + using SmemLayout = decltype(Operand::GetSmemLayout::apply(kShapeMK)); + using SmemAccessor = SmemAccessorV2; + + using GmemIter = typename decltype(Operand::GetGmemIter::apply( + type_c, type_c, type_c, kShapeMK, constant{}))::type; + + using SmemCopyAtom = typename Operand::SmemCopyAtom; +}; + +// CPO for getting specific operand templates +template +struct GetOperand: std::false_type { +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/predicate.h b/src/turbomind/kernels/gemm/predicate.h new file mode 100644 index 0000000000..d5811682b8 --- /dev/null +++ b/src/turbomind/kernels/gemm/predicate.h @@ -0,0 +1,55 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include + +namespace turbomind::gemm { + +template +struct Predicate { + + static constexpr int kSizeC = AlignedC ? 1 : C; + + static_assert(S * kSizeC <= 32); + + static constexpr bool is_active = true; + + uint32_t pred_{}; + + __device__ int operator()(int s, int c) const + { + return (pred_ & (1 << (s * kSizeC + c))) != 0; + } + + __device__ void set(int s, int c) + { + pred_ |= (1 << (s * kSizeC + c)); + } + + __device__ void clear() + { + pred_ = 0; + } +}; + +template +struct Predicate { + + static constexpr bool is_active = false; + + __device__ constexpr std::integral_constant operator()(int, int) const + { + return {}; + } + + __device__ void set(int, int) {} + + __device__ void clear() + { + // pred_ = 0; + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/registry.cu b/src/turbomind/kernels/gemm/registry.cu new file mode 100644 index 0000000000..da3d2923c5 --- /dev/null +++ b/src/turbomind/kernels/gemm/registry.cu @@ -0,0 +1,39 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
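+// NOTE: summary inferred from the code below. The registry instantiates the
+// kernel families listed in the constructor and filters them in Add(): a
+// kernel is kept only when its target architecture is compatible with the
+// running device (arch_ = major * 100 + minor * 10) and its shared-memory
+// requirement fits within sharedMemPerBlockOptin. The family names appear to
+// encode the configuration, e.g. f16_u4g128_f16_tnt_sm80_s16816 would read as
+// A = f16, B = u4 with group size 128, C = f16, T/N/T layouts, sm80+, using
+// the 16x8x16 tensor-core MMA -- this decoding is an inference from the
+// testbed and kernel headers rather than something stated explicitly here.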
+ +#include "src/turbomind/kernels/gemm/arch.h" +#include "src/turbomind/kernels/gemm/registry.h" + +namespace turbomind::gemm { + +Registry::Registry(std::shared_ptr device_prop): + device_prop_{std::move(device_prop)}, arch_{device_prop_->major * 100 + device_prop_->minor * 10} +{ + f16_u4g128_f16_tnt_sm70_s884(); + f16_u4g128_f16_tnt_sm75_simt(); + f16_u4g128_f16_tnt_sm75_s16816(); + f16_u4g128_f16_tnt_sm80_s16816(); + f16_u4g128_f16_tnt_sm90_s16816(); + + u4g128_f16_f16_nnn_sm80_s16816(); +} + +bool Registry::Add(std::unique_ptr kernel) +{ + if (!is_arch_compatible(kernel->arch(), arch_)) { + return false; + } + if ((int)device_prop_->sharedMemPerBlockOptin < kernel->smem_size()) { + return false; + } + // std::cout << "register: " << kernel->name() // + // << ", shared: " << (kernel->smem_size() >> 10) << " KB" // + // << ", regs: " << kernel->desc().attr.numRegs // + // << ", local: " << (float)kernel->desc().attr.localSizeBytes << " bytes" // + // << ", max_active_ctas: " << kernel->desc().max_active_ctas << " \n"; + + kernels_.push_back(std::move(kernel)); + ptrs_.push_back(kernels_.back().get()); + return true; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/registry.h b/src/turbomind/kernels/gemm/registry.h new file mode 100644 index 0000000000..401325cdda --- /dev/null +++ b/src/turbomind/kernels/gemm/registry.h @@ -0,0 +1,43 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/gemm/kernel_impl.h" +#include + +namespace turbomind::gemm { + +class Registry { +public: + explicit Registry(std::shared_ptr device_prop); + + template + [[maybe_unused]] bool Add() + { + return Add(std::make_unique>()); + } + + [[nodiscard]] const std::vector& kernels() const + { + return ptrs_; + } + +private: + bool Add(std::unique_ptr kernel); + + void f16_u4g128_f16_tnt_sm70_s884(); + void f16_u4g128_f16_tnt_sm75_simt(); + void f16_u4g128_f16_tnt_sm75_s16816(); + void f16_u4g128_f16_tnt_sm80_s16816(); + void f16_u4g128_f16_tnt_sm90_s16816(); + + void u4g128_f16_f16_nnn_sm80_s16816(); + +private: + std::shared_ptr device_prop_; + int arch_; + std::vector> kernels_; + std::vector ptrs_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/simt.h b/src/turbomind/kernels/gemm/simt.h new file mode 100644 index 0000000000..7ec9594c90 --- /dev/null +++ b/src/turbomind/kernels/gemm/simt.h @@ -0,0 +1,19 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +namespace turbomind::gemm::simt { + +// constexpr int OP_M = 2; +// constexpr int OP_N = 16; +// constexpr int OP_K = 4; + +// constexpr int OP_M = 4; +// constexpr int OP_N = 8; +// constexpr int OP_K = 8; + +constexpr int OP_M = 1; +constexpr int OP_N = 32; +constexpr int OP_K = 8; + +} // namespace turbomind::gemm::simt diff --git a/src/turbomind/kernels/gemm/smem_copy.h b/src/turbomind/kernels/gemm/smem_copy.h new file mode 100644 index 0000000000..de08d77765 --- /dev/null +++ b/src/turbomind/kernels/gemm/smem_copy.h @@ -0,0 +1,200 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
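+// NOTE: summary inferred from the code below. These helpers move operand
+// fragments from shared memory into registers. VoidSmemCopyAtom is the no-op
+// atom for absent operands; SmemAccessorV2 adapts indexing so (m, k) access
+// works for either storage order; the SmemCopyAtom_Pack_* atoms load one
+// packed fragment per lane with a single Lds when the mask is set. SmemCopy
+// then iterates an atom over a warp's tile: for group_size == 1 it
+// precomputes the swizzled smem offsets ("phases") in the constructor and
+// reuses them on every k-slice, otherwise it falls back to a generic accessor
+// path that rescales the k coordinate by the quantization group size.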
+ +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include "src/turbomind/kernels/core/layout.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/smem.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +struct VoidSmemCopyAtom { + + static constexpr int M = 1; + static constexpr int K = 1; + + static constexpr int kFragNum = 1; + + using Frag = Array; + + template + __device__ static void copy(S, D, bool) + { + } + + __device__ static int2 get_offset(int) + { + return {}; + } + + __device__ static int2 unique(int thread_idx, int pack_idx) + { + return {}; + } +}; + +template +struct SmemAccessorV2 { +}; + +template +struct SmemAccessorV2: SmemAccessor { + using SmemAccessor::SmemAccessor; +}; + +template +struct SmemAccessorV2 { + SmemAccessor base_; + + __device__ SmemAccessorV2(get_pointer_type ptr): base_{ptr} {} + __device__ T& operator()(int m, int k) + { + return base_(k, m); + } +}; + +template +struct SmemCopyAtom_Pack_v2 { + static constexpr int M = M_; + static constexpr int K = K_; + + static constexpr int kFragNum = FragNum_; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) // -> (m, k) + { + const int lane_id = thread_idx % WARP_SIZE; + + const int c = lane_id / RepeatC * Frag::size(); + + return order == kRowMajor ? int2{0, c} : int2{c, 0}; + } + + template + __device__ static void copy(S src_ptr, D dst_ptr, bool mask) + { + auto dst_raw_ptr = (T*)dst_ptr; // SubBytePtr -> T* + if (mask) { + Lds(*(Frag*)dst_raw_ptr, src_ptr); + } + } +}; + +template +struct SmemCopyAtom_Pack_v3 { + static constexpr int M = CopyAtom::M * FragNum_; + static constexpr int K = CopyAtom::K; + + static constexpr int kFragNum = FragNum_; + + using Frag = Array; + + __device__ static int2 get_offset(int thread_idx) // -> (m, k) + { + const int c = CopyAtom::unique(thread_idx, 0).x * Frag::size(); + + return order == kRowMajor ? 
int2{0, c} : int2{c, 0}; + } + + template + __device__ static void copy(S src_ptr, D dst_ptr, bool mask) + { + if (mask) { + auto dst_raw_ptr = (T*)dst_ptr; // SubBytePtr -> T* + Lds(*(Frag*)dst_raw_ptr, src_ptr); + } + } +}; + +template +struct SmemCopy { + using Atom = typename Operand::SmemCopyAtom; + + static constexpr int kFragNum = Atom::kFragNum; + + static constexpr int ITER_M = iM / Atom::kFragNum; + + static_assert(ITER_M > 0); + + using Frag = typename Atom::Frag[ITER_M]; + + using Pack = Packing_v2; + + static constexpr int2 delta = Pack::apply(int2{dM * kFragNum, dK}); + + using Layout = typename Operand::SmemLayout; + + static constexpr int2 kMK0 = cs2mk(Layout::C0, Layout::S0); + + static constexpr int kPeriodM = ceil_div(kMK0.x, delta.x); + static constexpr int kPeriodK = ceil_div(kMK0.y, delta.y); + + const int2 offset_; + + int phases_[kPeriodK][kPeriodM]; + + __device__ SmemCopy(int2 offset): offset_{offset} + { + const int2 thr = Atom::get_offset(threadIdx.x); + PRAGMA_UNROLL + for (int k = 0; k < kPeriodK; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < kPeriodM; ++m) { + const int2 pack = Pack::apply({offset.x + m * dM * kFragNum, offset.y + k * dK}); + const int2 cs = mk2cs({pack.x + thr.x, pack.y + thr.y}); + phases_[k][m] = Layout::apply(cs.y, cs.x); + } + } + } + + template + __device__ void operator()(Pointer src_ptr, Frag& dst, int k, bool mask = true) + { + using Accessor = typename Operand::SmemAccessor; + if constexpr (Operand::kGroupSize == 1) { + PRAGMA_UNROLL + for (int m = 0; m < ITER_M; ++m) { + const int mm = m / kPeriodM * kPeriodM * dM * kFragNum; + const int kk = k / kPeriodK * kPeriodK * dK; + const int2 cs = mk2cs(Pack::apply(int2{mm, kk})); + const int i0 = Layout::apply(cs.y, cs.x); + const int i1 = phases_[k % kPeriodK][m % kPeriodM]; + Atom::copy(&src_ptr[i0 + i1], dst[m].data(), mask); + } + } + else { // generic case + Accessor smem{src_ptr}; + const int2 thr = Atom::get_offset(threadIdx.x); + PRAGMA_UNROLL + for (int m = 0; m < ITER_M; ++m) { + const int mm = offset_.x + m * dM * kFragNum; + const int kk = offset_.y + k * dK; // Note: this forbids sub-tile group sizes + const int2 mk = Pack::apply(int2{mm, kk / Operand::kGroupSize}); + Atom::copy(&smem(mk.x + thr.x, mk.y + thr.y), dst[m].data(), mask); + } + } + // else if constexpr (Operand::kPack != 0 && Operand::kGroupSize != 1) { // group size = 1, pack != 0 + // const int mask_k = Operand::kGroupSize == 1; + // const int2 pack = Pack::apply(int2{offset_.x, offset_.y}); + // const int2 thr = Atom::get_offset(threadIdx.x); + // const int2 cs = mk2cs({pack.x + thr.x, (pack.y + thr.y) * mask_k}); + // auto smem = src_ptr + Layout::apply(cs.y, cs.x); + // PRAGMA_UNROLL + // for (int m = 0; m < ITER_M; ++m) { + // const int mm = m * dM * kFragNum; + // const int kk = k * dK; + // const int2 cs = mk2cs(Pack::apply(int2{mm, kk * mask_k})); + // const int idx = Layout::apply(cs.y, cs.x); + // Atom::copy(&smem[idx], dst[m].data(), mask); + // } + // } + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/test/gemm_bench.cu b/src/turbomind/kernels/gemm/test/gemm_bench.cu new file mode 100644 index 0000000000..3295d2e1a6 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/gemm_bench.cu @@ -0,0 +1,89 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
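+// NOTE: summary inferred from the code below. The benchmark sweeps three
+// nvbench axes: `idx` selects one of the (output_dims, input_dims) GEMM
+// shapes from `config` (grouped four per model), `bs` is the batch size
+// (powers of two from 1 to 1024) and `tp` is the tensor-parallel degree.
+// Depending on whether the layer is column- or row-parallel (idx % 4), either
+// output_dims or input_dims is divided by tp, and configurations that do not
+// divide evenly (or whose reduced K is not a multiple of the group size) are
+// skipped. With the stock nvbench option parser a single point can presumably
+// be selected with something like `-a idx=4 -a bs=128 -a tp=1`; that command
+// line is illustrative and not taken from this patch.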
+ +#include "nvbench/main.cuh" +#include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/test/testbed.h" +#include +#include +#include + +std::vector> config{ + {11008 * 2, 4096}, {4096, 11008}, {12288, 4096}, {4096, 4096}, // llama2-7b + {14336 * 2, 4096}, {4096, 14336}, {6144, 4096}, {4096, 4096}, // llama3-8b / internlm2.5-7b + {16384 * 2, 6144}, {6144, 16384}, {8192, 6144}, {6144, 6144}, // internlm2-20b + {13696 * 2, 4096}, {4096, 13696}, {4608, 4096}, {4096, 4096}, // glm4-9b + {18944 * 2, 3584}, {3584, 18944}, {4608, 3584}, {3584, 3584}, // qwen2-7b + {20480 * 2, 7168}, {7168, 20480}, {9216, 7168}, {7168, 7168}, // yi-34b + {28672 * 2, 8192}, {8192, 28672}, {10240, 8192}, {8192, 8192}, // llama2-70b / llama3-70b + {29696 * 2, 8192}, {8192, 29696}, {10240, 8192}, {8192, 8192} // qwen2-72b-instruct-awq +}; + +// {29568 * 2, 8192}, {8192, 29568}, {10240, 8192}, {8192, 8192}, // qwen2-72b + +void gemm_bench(nvbench::state& state) +{ + const auto idx = state.get_int64("idx"); + + const auto bs = state.get_int64("bs"); + const auto tp = state.get_int64("tp"); + + auto [output_dims, input_dims] = config[idx]; + + constexpr int group_size = 128; + + if (idx % 4 == 0 || idx % 4 == 2) { + if (output_dims % tp) + return; + output_dims /= tp; + } + else { + if (input_dims % tp) + return; + input_dims /= tp; + } + + if (input_dims % group_size) + return; + + using turbomind::gemm::get_test; + + { + int m = bs; + int n = output_dims; + int k = input_dims; + if (get_test().kBatchDim == 1) { + std::swap(m, n); + } + std::cerr << "m" << m << "n" << n << "k" << k << "\n"; + get_test().Initialize(m, n, k, group_size, state.get_cuda_stream()); + } + + state.add_element_count((size_t)bs * output_dims * input_dims * 2); // mul + add + + // state.collect_dram_throughput(); + // state.collect_l2_hit_rates(); + + if constexpr (1) { + state.add_global_memory_reads(get_test().global_memory_reads()); + get_test().Run(); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // + get_test().Run(); + }); + } + else { + state.add_global_memory_reads(sizeof(half) * (bs * input_dims + output_dims * input_dims)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // + get_test().RunCublas(); + }); + } +} + +NVBENCH_BENCH(gemm_bench) + .add_int64_axis("idx", nvbench::range(0, (int)config.size() - 1)) + .add_int64_power_of_two_axis("bs", nvbench::range(0, 10)) + .add_int64_axis("tp", {1, 2, 4}); + +int main(int argc, char* argv[]) +{ + NVBENCH_MAIN_BODY(argc, argv); +} diff --git a/src/turbomind/kernels/gemm/test/gemm_test.cu b/src/turbomind/kernels/gemm/test/gemm_test.cu new file mode 100644 index 0000000000..224d61f193 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/gemm_test.cu @@ -0,0 +1,68 @@ + +#include "src/turbomind/kernels/attention/quantization.h" + +#include "src/turbomind/kernels/gemm/convert_v2.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/gpu_metric.h" +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/test/quantization.h" +#include "src/turbomind/kernels/gemm/test/test_utils.h" +#include "src/turbomind/kernels/gemm/test/testbed.h" +#include "src/turbomind/kernels/gemm/types.h" +#include +#include +#include + +#include + +using namespace turbomind; +using namespace gemm; +using thrust::universal_vector; + +cublasHandle_t cublas_handle{}; + +void ComputeRefCpu(half* C, const half* A, const half* B, int m, int n, int k) +{ + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < 
n; ++nn) { + float c = 0; + for (int kk = 0; kk < k; ++kk) { + c += (float)A[mm * k + kk] * (float)B[nn * k + kk]; + } + C[mm * n + nn] = c; + } + } +} + +void Run(int batch_size, int output_dims, int input_dims, int g = 128) +{ + auto& test = get_test(); + int m = batch_size; + int n = output_dims; + int k = input_dims; + if (get_test().kBatchDim == 1) { + std::swap(m, n); + } + std::cerr << "m" << m << "n" << n << "k" << k << "\n"; + test.Initialize(m, n, k, g, 0); + + for (int i = 0; i < 10; ++i) { + test.Run(); + } + + // test.CompareB(); + test.CompareC(); + + return; +} + +int main(int argc, char* argv[]) +{ + Run(16384, 16384, 16384); + + if (auto ec = cudaDeviceSynchronize(); ec != cudaSuccess) { + std::cerr << "un-clean exit: " << cudaGetErrorString(ec) << "\n"; + } + + return 0; +} diff --git a/src/turbomind/kernels/gemm/test/quantization.cu b/src/turbomind/kernels/gemm/test/quantization.cu new file mode 100644 index 0000000000..3b4d200dab --- /dev/null +++ b/src/turbomind/kernels/gemm/test/quantization.cu @@ -0,0 +1,27 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/test/quantization_impl.h" + +namespace turbomind::gemm { + +template void Quantize(const thrust::universal_vector& x, + int m, + int k, + Order order, + int group_size, + thrust::universal_vector& x_p, // pseudo-quantized + thrust::universal_vector& x_q, // quantized ushort + thrust::universal_vector& x_u, // scales & zeros (always m-major) + cudaStream_t stream); + +template void Quantize(const thrust::universal_vector& x, + int m, + int k, + Order order, + int group_size, + thrust::universal_vector& x_p, // pseudo-quantized + thrust::universal_vector& x_q, // quantized ushort + thrust::universal_vector& x_u, // scales & zeros (always m-major) + cudaStream_t stream); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/test/quantization.h b/src/turbomind/kernels/gemm/test/quantization.h new file mode 100644 index 0000000000..cddfa82a0b --- /dev/null +++ b/src/turbomind/kernels/gemm/test/quantization.h @@ -0,0 +1,22 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/types.h" +#include +#include + +#pragma once + +namespace turbomind::gemm { + +template +void Quantize(const thrust::universal_vector& x, + int m, + int k, + Order order, + int group_size, + thrust::universal_vector& x_p, // pseudo-quantized + thrust::universal_vector& x_q, // quantized ushort + thrust::universal_vector& x_u, // scales & zeros (always m-major) + cudaStream_t stream); + +} diff --git a/src/turbomind/kernels/gemm/test/quantization_impl.h b/src/turbomind/kernels/gemm/test/quantization_impl.h new file mode 100644 index 0000000000..992e5a3c3b --- /dev/null +++ b/src/turbomind/kernels/gemm/test/quantization_impl.h @@ -0,0 +1,211 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
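+// NOTE: summary inferred from the kernels below, with G the group size.
+// For every output channel n and every group of G consecutive k elements:
+//   1. find_stats records the (min, max) of the group, stored n-major;
+//   2. find_params derives an asymmetric scale/zero pair,
+//        scale = (max - min) / (2^bits - 1),   zero = min,
+//      written interleaved as (scale, zero) into x_u;
+//   3. quantize produces integer codes roughly as q = quant((x - zero)/scale)
+//      and, in the same pass, the pseudo-quantized reconstruction
+//      x_hat = q * scale + zero used by the fp reference path.
+// Column-major inputs are transposed to row-major first and the results are
+// transposed back at the end, so each channel's k elements stay contiguous
+// while the grouping runs along k.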
+ +#pragma once + +#include "src/turbomind/kernels/attention/quantization.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/test/test_utils.h" +#include "src/turbomind/kernels/gemm/types.h" + +#include +#include + +namespace turbomind::gemm { + +// quantize using `scale` and `zeros`, +template +__global__ void find_stats(Array* minmax, const T* src, int N, int K, int G) +{ + int n_idx = blockIdx.x * blockDim.x + threadIdx.x; + int k_idx = blockIdx.y; + + if (n_idx >= N || k_idx * G >= K) { + return; + } + + float minval = std::numeric_limits::infinity(); + float maxval = -minval; + + const int L = min(K, G); + + for (int k = 0; k < L; k += 8) { + Array vec; + Load(vec, &src[n_idx * K + k_idx * G + k]); + PRAGMA_UNROLL + for (int i = 0; i < vec.size(); ++i) { + minval = __hmin(minval, vec[i]); + maxval = __hmax(maxval, vec[i]); + } + } + + // store in n-major + Store(minmax[k_idx * N + n_idx].data(), Array{minval, maxval}); +} + +template +__global__ void find_params(T* param, const Array* minmax, int count) +{ + int global_idx = threadIdx.x + blockIdx.x * blockDim.x; + if (global_idx >= count) { + return; + } + auto stats = minmax[global_idx]; + const float inv_q_max = fdividef(1.f, (1 << bitsof)-1); + + static_assert(asym); + + float scale = (T)(((float)stats[1] - (float)stats[0]) * inv_q_max); + + // force trivial scale / zero for debugging + if constexpr (0) { + stats[0] = 0; + scale = 1.f; + } + + Store(param + global_idx * 2, Array{scale, stats[0]}); +} + +template +__global__ void quantize(uint16_t* dst, T* pseudo, const T* src, const T* stats, int N, int K, int G) +{ + static_assert(bitsof <= 16); + static_assert(bitsof == 16); // fp16 & bf16 + + int n_idx = blockIdx.x * blockDim.x + threadIdx.x; + int k_idx = blockIdx.y; + + if (n_idx >= N || k_idx * G >= K) { + return; + } + + Array param; + Load(param, stats + (k_idx * N + n_idx) * 2); + + float inv_scale = fdividef(1.f, param[0]); + + const int L = min(K, G); + + for (int k = 0; k < L; k += 8) { + Array vi; + Array vo; + Load(vi, &src[n_idx * K + k_idx * G + k]); + + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + float u = (static_cast(vi[i] - param[1])) * inv_scale; + vo[i] = quant(u, bitsof); + } + Store(&dst[n_idx * K + k_idx * G + k], vo); + + if (pseudo) { + Array vf; + PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + vf[i] = __hfma(static_cast(vo[i]), param[0], param[1]); + } + Store(&pseudo[n_idx * K + k_idx * G + k], vf); + } + } +} + +template +__global__ void transpose(const T* src, T* dst, int s, int c) +{ + const int cid = threadIdx.x + blockIdx.x * blockDim.x; + const int sid = threadIdx.y + blockIdx.y * blockDim.y; + if (sid < s && cid < c) { + dst[cid * s + sid] = src[sid * c + cid]; + } +} + +template +void invokeTranspose(const T* src, T* dst, int s, int c, cudaStream_t stream) +{ + const dim3 block{32, 16}; + const dim3 grid(ceil_div(c, block.x), ceil_div(s, block.y)); + + transpose<<>>(src, dst, s, c); +} + +template +void Quantize(const thrust::universal_vector& x, + int m, + int k, + Order order, + int group_size, + thrust::universal_vector& x_p, // pseudo-quantized + thrust::universal_vector& x_q, // quantized ushort + thrust::universal_vector& x_u, // scales & zeros (always m-major) + cudaStream_t stream) + +{ + auto policy = thrust::device.on(stream); + + thrust::universal_vector _x(x.size()); + thrust::universal_vector _x_p(x.size()); + thrust::universal_vector 
_x_q(x.size()); + thrust::universal_vector> stats(ceil_div(k, group_size) * m); + + x_p.resize(x.size()); + x_q.resize(x.size()); + /// FIXME: correct the size + x_u.resize(stats.size() * 2); + + if (order == Order::kRowMajor) { + thrust::copy(policy, x.begin(), x.end(), _x.begin()); + } + else { + invokeTranspose(x.data().get(), _x.data().get(), k, m, stream); + } + + const int block = std::min(256, m); + const dim3 grid(ceil_div(m, block), ceil_div(k, group_size)); + + find_stats<<>>(stats.data().get(), // + _x.data().get(), + m, + k, + group_size); + + find_params<<(stats.size(), 256), 256, 0, stream>>>( // + x_u.data().get(), + stats.data().get(), + stats.size()); + + quantize<<>>(_x_q.data().get(), // + _x_p.data().get(), + _x.data().get(), + x_u.data().get(), + m, + k, + group_size); + + if (order == Order::kRowMajor) { + thrust::copy(policy, _x_p.begin(), _x_p.end(), x_p.begin()); + thrust::copy(policy, _x_q.begin(), _x_q.end(), x_q.begin()); + } + else { + invokeTranspose(_x_p.data().get(), x_p.data().get(), m, k, stream); + invokeTranspose(_x_q.data().get(), x_q.data().get(), m, k, stream); + } + + cudaStreamSynchronize(stream); + + Compare(_x_p.data().get(), _x.data().get(), k, k, m); + + const int kg = ceil_div(k, group_size); + for (int i = 0; i < m * kg; ++i) { + // int mi = i % m; + // int ki = i / m; + + // x_u[i * 2] = i; + // x_u[i * 2 + 1] = i; + + // x_u[i * 2] = i * 2; + // x_u[i * 2 + 1] = i * 2 + 1; + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/test/reference.cu b/src/turbomind/kernels/gemm/test/reference.cu new file mode 100644 index 0000000000..591d8e6bc6 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/reference.cu @@ -0,0 +1,109 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/test/reference.h" +#include + +namespace turbomind::gemm { + +#define CHECK(cond) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "*** Check failed: (%s) @ %s:%d\n", #cond, __FILE__, __LINE__); \ + std::abort(); \ + } \ + } while (0) + +namespace { + +MatrixLayout transpose(MatrixLayout x) +{ + std::swap(x.rows, x.cols); + x.order = x.order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; + return x; +} + +cudaDataType to_cuda_dtype(DataType dtype) +{ + switch (dtype) { + case DataType::F16: + return CUDA_R_16F; + case DataType::BF16: + return CUDA_R_16BF; + default: + CHECK("unsupported data type" && 0); + } + return {}; +} + +} // namespace + +Reference::Reference() +{ + cublasCreate(&handle_); +} + +Reference::~Reference() +{ + if (handle_) { + cublasDestroy(handle_); + handle_ = {}; + } +} + +void Reference::set_stream(cudaStream_t stream) +{ + cublasSetStream(handle_, stream); +} + +void Reference::gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc) +{ + + // Transpose the problem for C to be column major + if (Cdesc.order == Order::kRowMajor) { + std::swap(A, B); + std::swap(Adesc, Bdesc); + Adesc = transpose(Adesc); + Bdesc = transpose(Bdesc); + Cdesc = transpose(Cdesc); + // (n, k) (k, m) + } + + CHECK(Adesc.cols == Bdesc.rows); + + // (m, k) (k, n) + int m = Cdesc.rows; + int n = Cdesc.cols; + int k = Adesc.cols; + CHECK(Adesc.rows == m); + CHECK(Bdesc.cols == n); + CHECK(Bdesc.rows == k); + + float alpha = 1.f; + float beta = 0.f; + + auto to_cublas_op = [](Order o) { return o == Order::kColMajor ? 
CUBLAS_OP_N : CUBLAS_OP_T; }; + + auto status = cublasGemmEx(handle_, + to_cublas_op(Adesc.order), + to_cublas_op(Bdesc.order), + m, + n, + k, + &alpha, + A, + to_cuda_dtype(Adesc.type), + Adesc.ld, + B, + to_cuda_dtype(Bdesc.type), + Bdesc.ld, + &beta, + C, + to_cuda_dtype(Cdesc.type), + Cdesc.ld, + CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + + CHECK(status == CUBLAS_STATUS_SUCCESS); +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/test/reference.h b/src/turbomind/kernels/gemm/test/reference.h new file mode 100644 index 0000000000..cc1cd7f7c5 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/reference.h @@ -0,0 +1,24 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/gemm/types.h" + +#include + +namespace turbomind::gemm { + +class Reference { +public: + Reference(); + ~Reference(); + + void set_stream(cudaStream_t stream); + + void gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc); + +private: + cublasHandle_t handle_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/test/test_utils.cu b/src/turbomind/kernels/gemm/test/test_utils.cu new file mode 100644 index 0000000000..f6e3915fa6 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/test_utils.cu @@ -0,0 +1,200 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/test/test_utils.h" +#include +#include +#include +#include +#include + +#define _CG_ABI_EXPERIMENTAL +#include +#include +#include + +namespace turbomind { + +cublasHandle_t cublas_handle{}; +cudaStream_t cublas_stream{}; + +template +void Compare(const T* src, const T* ref, size_t stride, int dims, int bsz, bool show, float rtol, float atol) +{ + float asums{}; + float rsums{}; + int outliers{}; + for (int nn = 0; nn < bsz; ++nn) { + float abs_diff_sum{}; + float rel_diff_sum{}; + for (int mm = 0; mm < dims; ++mm) { + auto x = float(src[nn * stride + mm]); + auto y = float(ref[nn * stride + mm]); + // if (show) { + // std::cout << x << "\t" << y << std::endl; + // } + auto abs_diff = std::abs(x - y); + auto rel_diff = abs_diff / std::abs(y + 1e-6f); + if (!(abs_diff <= atol + rtol * std::abs(y))) { + ++outliers; + if (show) { + std::cout << nn << "," << mm << "\t" << x << "\t" << y << std::endl; + } + } + abs_diff_sum += abs_diff; + rel_diff_sum += rel_diff; + } + asums += abs_diff_sum / dims; + rsums += rel_diff_sum / dims; + } + std::cout << "abs_diff = " << asums / bsz << " rel_diff = " << rsums / bsz + << " outliers = " << outliers / (float)bsz << std::endl; +} + +template void +Compare(const half* src, const half* ref, size_t stride, int dims, int bsz, bool show, float rtol, float atol); +template void +Compare(const float* src, const float* ref, size_t stride, int dims, int bsz, bool show, float rtol, float atol); +#if ENABLE_BF16 +template void Compare(const nv_bfloat16* src, + const nv_bfloat16* ref, + size_t stride, + int dims, + int bsz, + bool show, + float rtol, + float atol); +#endif + +void LoadBinary(const std::string& path, size_t size, void* dst) +{ + std::ifstream ifs(path, std::ios::binary | std::ios::in); + if (!ifs.is_open()) { + std::cerr << "failed to open " << path << "\n"; + std::abort(); + } + ifs.seekg(0, ifs.end); + auto actual_size_in_bytes = ifs.tellg(); + ifs.seekg(0, ifs.beg); + if (size != actual_size_in_bytes) { + std::cerr << "[warning] file " << path << " has " << actual_size_in_bytes << " bytes, while " << size + << " bytes is 
requested\n"; + } + ifs.read((char*)dst, size); + std::cerr << "[info] " << path << " " << size << "\n"; +} + +namespace cg = cooperative_groups; + +__global__ void curand_init(curandState* state) +{ + auto tid = cg::this_grid().thread_rank(); + curand_init(0xe4c45822e90461ddULL, tid, 0, state + tid); +} + +template +__global__ void curand_uniform(curandState* state, size_t count, T* result, float scale, float shift) +{ + auto grid = cg::this_grid(); + for (auto i = grid.thread_rank(); i < count; i += grid.size()) { + float tmp = curand_uniform(state + grid.thread_rank()); + result[i] = T(scale * tmp + shift); + } +} + +template +__global__ void curand_normal(curandState* state, size_t count, T* result, float scale, float shift) +{ + auto grid = cg::this_grid(); + for (auto i = grid.thread_rank(); i < count; i += grid.size()) { + float tmp = curand_normal(state + grid.thread_rank()); + result[i] = T(scale * tmp + shift); + } +} + +__global__ void curand_bytes(curandState* state, size_t count, uint* result) +{ + auto grid = cg::this_grid(); + for (auto i = grid.thread_rank(); i < count; i += grid.size()) { + result[i] = curand(state + grid.thread_rank()); + } +} + +struct RNG::Impl { + + curandState* states{}; + + Impl() + { + cudaMalloc(&states, sizeof(curandState) * 64 * 64); + curand_init<<<64, 64>>>(states); + } + + ~Impl() + { + cudaFree(states); + } + + void GenerateUInt(uint* out, size_t count) + { + curand_bytes<<<64, 64, 0, stream_>>>(states, count, out); + } + + template + void GenerateUniform(T* out, size_t count, float scale, float shift) + { + curand_uniform<<<64, 64, 0, stream_>>>(states, count, out, scale, shift); + } + + template + void GenerateNormal(T* out, size_t count, float scale, float shift) + { + curand_normal<<<64, 64, 0, stream_>>>(states, count, out, scale, shift); + } + + cudaStream_t stream_{}; +}; + +RNG::RNG(): impl_(std::make_unique()) {} + +RNG::~RNG() = default; + +void RNG::GenerateUInt(uint* out, size_t count) +{ + impl_->GenerateUInt(out, count); +} + +template +void RNG::GenerateUniform(T* out, size_t count, float scale, float shift) +{ + impl_->GenerateUniform(out, count, scale, shift); +} + +template +void RNG::GenerateNormal(T* out, size_t count, float scale, float shift) +{ + impl_->GenerateNormal(out, count, scale, shift); +} + +cudaStream_t RNG::stream() const +{ + return impl_->stream_; +} + +void RNG::set_stream(cudaStream_t stream) +{ + impl_->stream_ = stream; +} + +template void RNG::GenerateUniform(half* out, size_t count, float scale, float shift); +template void RNG::GenerateUniform(float* out, size_t count, float scale, float shift); +#if ENABLE_BF16 +template void RNG::GenerateUniform(nv_bfloat16* out, size_t count, float scale, float shift); +#endif + +template void RNG::GenerateNormal(half* out, size_t count, float scale, float shift); +template void RNG::GenerateNormal(float* out, size_t count, float scale, float shift); +#if ENABLE_BF16 +template void RNG::GenerateNormal(nv_bfloat16* out, size_t count, float scale, float shift); +#endif + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/test/test_utils.h b/src/turbomind/kernels/gemm/test/test_utils.h new file mode 100644 index 0000000000..27281a4a47 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/test_utils.h @@ -0,0 +1,44 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
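+// NOTE: summary inferred from the implementations in test_utils.cu. Compare()
+// walks `bsz` rows of `dims` elements (row stride `stride`), flags an element
+// as an outlier when |x - ref| > atol + rtol * |ref|, and prints the mean
+// absolute / relative difference plus the outlier count averaged over rows.
+// RNG wraps a grid of curand states; a typical use, with illustrative values:
+//
+//   RNG rng;
+//   rng.set_stream(stream);
+//   // scale * U(0, 1] + shift, i.e. roughly uniform in (-0.5, 0.5]
+//   rng.GenerateUniform(ptr, count, /*scale=*/1.f, /*shift=*/-0.5f);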
+ +#pragma once + +#include "src/turbomind/macro.h" +#include +#include + +namespace turbomind { + +template +void Compare(const T* src, + const T* ref, + size_t stride, + int dims, + int bsz, + bool show = false, + float rtol = 1e-2, + float atol = 1e-4); + +void LoadBinary(const std::string& path, size_t size, void* dst); + +class RNG { +public: + RNG(); + ~RNG(); + void GenerateUInt(uint* out, size_t count); + + template + void GenerateUniform(T* out, size_t count, float scale = 1.f, float shift = 0.f); + + template + void GenerateNormal(T* out, size_t count, float scale = 1.f, float shift = 0.f); + + cudaStream_t stream() const; + + void set_stream(cudaStream_t stream); + +private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h new file mode 100644 index 0000000000..3ca1239729 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -0,0 +1,422 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/test/quantization.h" +#include "src/turbomind/kernels/gemm/test/reference.h" +#include "src/turbomind/kernels/gemm/test/test_utils.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" +#include +#include +#include +#include +#include + +namespace turbomind::gemm { + +using thrust::universal_vector; + +#define CHECK(cond) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "*** Check failed: (%s) @ %s:%d\n", #cond, __FILE__, __LINE__); \ + std::abort(); \ + } \ + } while (0) + +template +class Testbed { +public: + static constexpr int kBatchDim = batch_dim; + + Testbed(): dispatch_policy_{DispatchPolicy::kDefault} {} + + Testbed(DispatchPolicy dispatch_policy, std::string cache_path): + dispatch_policy_{dispatch_policy}, cache_path_{cache_path} + { + if (dispatch_policy & DispatchPolicy::kReuse) { + std::ifstream ifs(cache_path); + if (ifs.is_open()) { + gemm_.Import(ifs); + } + else { + std::cerr << "failed to import dispatch cache from \"" << cache_path << "\"" << std::endl; + } + } + } + + ~Testbed() + { + if (dispatch_policy_ & DispatchPolicy::kMeasure) { + std::ofstream ofs(cache_path_); + if (ofs.is_open()) { + gemm_.Export(ofs); + } + else { + std::cerr << "failed to export dispatch cache to \"" << cache_path_ << "\"" << std::endl; + } + } + } + + void Initialize(int m, int n, int k, int g, cudaStream_t stream) + { + rng_.set_stream(stream); + reference_.set_stream(stream); + stream_ = stream; + + m_ = m; + n_ = n; + k_ = k; + + a_.resize(m * k); + b_.resize(n * k); + c_.resize(m * n); + + a_desc_ = MatrixLayout{get_data_type_v, order_a, m, k, mk2cs(m, k).x}; + b_desc_ = MatrixLayout{get_data_type_v, order_b, k, n, _kn2cs(k, n).x}; + c_desc_ = MatrixLayout{get_data_type_v, order_c, m, n, mk2cs(m, n).x}; + + c_f_.resize(c_.size()); + c_ref_.resize(c_.size()); + + // a_q_.resize(a_.size()); + // b_q_.resize(b_.size()); + + // u_.resize(a_.size()); + // v_.resize(b_.size()); + + // a_f_.resize(a_.size()); + // b_f_.resize(b_.size()); + + /// TODO: Revise packed format + a_pack_.resize(a_.size() / kVecSize); + b_pack_.resize(b_.size() / kVecSize); + + barriers_.resize(Gemm::kBarriersSize); + partials_.resize(Gemm::kPartialsSize); + + rng_.GenerateUniform(a_.data().get(), a_.size(), 1, -.5f); + rng_.GenerateUniform(b_.data().get(), 
b_.size(), 1, -.5f); + + for (int i = 0; i < n; ++i) { + // for (int j = 0; j < k; ++j) { + // b_[i * k + j] = i * k + j; + // } + // for (int j = 0; j < k; j += 2) { + // b_[i * k + j] = i; + // b_[i * k + j + 1] = j; + // } + } + + a_f_ = a_; + b_f_ = b_; + + a_pack_desc_ = a_desc_; + b_pack_desc_ = b_desc_; + u_pack_desc_ = {}; + v_pack_desc_ = {}; + + constexpr bool is_quant_a = !std::is_same_v; + constexpr bool is_quant_b = !std::is_same_v; + + if constexpr (is_quant_a) { + static_assert(pack_a && pack_u); + Quantize(a_, m, k, order_a, g, a_f_, a_q_, u_, stream); + u_pack_desc_ = u_desc_ = {DataType::U32, kColMajor, m, ceil_div(k, g), m}; + u_pack_desc_.pack = pack_u; + u_pack_.resize(u_.size()); + CHECK(!Convert(u_.data().get(), u_desc_, u_pack_.data().get(), u_pack_desc_, stream_)); + quant_a_ = {QuantType::kDefault, g}; + + // cudaDeviceSynchronize(); + + // for (int i = 0; i < u_pack_.size(); ++i) { + // std::cout << (float)u_pack_[i] << " "; + // } + // std::cout << "\n"; + } + + // b (k, n) -> v is always row major + if constexpr (is_quant_b) { + static_assert(pack_b && pack_v); + constexpr Order _order_b = transpose(order_b); + Quantize(b_, n, k, _order_b, g, b_f_, b_q_, v_, stream); + v_pack_desc_ = v_desc_ = {DataType::U32, kRowMajor, ceil_div(k, g), n, n}; + v_pack_desc_.pack = pack_v; + v_pack_.resize(v_.size()); + CHECK(!Convert(v_.data().get(), v_desc_, v_pack_.data().get(), v_pack_desc_, stream_)); + quant_b_ = {QuantType::kDefault, g}; + + // cudaDeviceSynchronize(); + + // for (int i = 0; i < v_pack_.size(); ++i) { + // std::cout << (float)v_pack_[i] << " "; + // } + // std::cout << "\n"; + } + + if constexpr (pack_a) { + a_pack_desc_.type = get_data_type_v; + a_pack_desc_.pack = pack_a; + const auto a_data = is_quant_a ? (void*)a_q_.data().get() : (void*)a_.data().get(); + CHECK(!Convert(a_data, a_desc_, a_pack_.data().get(), a_pack_desc_, stream_)); + } + else { + cudaMemcpyAsync( + (Ta*)a_pack_.data().get(), a_.data().get(), sizeof(Ta) * a_.size(), cudaMemcpyDefault, stream); + } + + if constexpr (pack_b) { + b_pack_desc_.type = get_data_type_v; + b_pack_desc_.pack = pack_b; + const auto b_data = is_quant_b ? 
(void*)b_q_.data().get() : (void*)b_.data().get(); + CHECK(!Convert(b_data, b_desc_, b_pack_.data().get(), b_pack_desc_, stream_)); + + // { + // cudaDeviceSynchronize(); + // for (int i = 0; i < n; ++i) { + // for (int j = 0; j < k; j += 2) { + // // int index = (int)((Tb*)b_pack_.data().get())[i * k + j]; + // // int row = index / k; + // // int col = index % k; + // int row = (int)((Tb*)b_pack_.data().get())[i * k + j]; + // int col = (int)((Tb*)b_pack_.data().get())[i * k + j + 1]; + // printf("(%2d,%2d) ", row, col); + // } + // printf("\n"); + // } + // } + } + else { + cudaMemcpyAsync( + (Tb*)b_pack_.data().get(), b_.data().get(), sizeof(Tb) * b_.size(), cudaMemcpyDefault, stream); + } + } + + void Run() + { + const Operation operation{ + dispatch_policy_, + Epilogue::kNone, + quant_a_, + quant_b_, + kBatchDim, + }; + + const Workspace workspace{barriers_.data().get(), barriers_.size(), partials_.data().get(), partials_.size()}; + + auto status = gemm_.Run(operation, + 1.f, + a_pack_.data().get(), + a_pack_desc_, + u_pack_.data().get(), + u_pack_desc_, + b_pack_.data().get(), + b_pack_desc_, + v_pack_.data().get(), + v_pack_desc_, + 0.f, + c_.data().get(), + c_desc_, + c_.data().get(), + c_desc_, + workspace, + stream_); + + if (status) { + std::cerr << "Run failed, code =" << status << "\n"; + std::abort(); + } + } + + void RunCublas() + { + reference_.gemm(a_f_.data().get(), // + a_desc_, + b_f_.data().get(), + b_desc_, + c_f_.data().get(), + c_desc_); + } + + void CompareB() + { + cudaDeviceSynchronize(); + Compare(b_f_.data().get(), b_.data().get(), k_, k_, n_); + } + + void CompareC() + { + for (int i = 0; i < 10; ++i) { + reference_.gemm(a_f_.data().get(), // + a_desc_, + b_f_.data().get(), + b_desc_, + c_ref_.data().get(), + c_desc_); + } + + // c_f_.resize(m_ * n_); + // computeRefCublas(c_f_.data().get(), a_.data().get(), b_f_.data().get(), m_, n_, k_, stream_); + // RunCublas(); + + cudaDeviceSynchronize(); + + // Compare(c_f_.data().get(), c_ref_.data().get(), n_, n_, m_, 0); + + // Compare(c_.data().get(), c_f_.data().get(), n_, n_, m_, 0); + + if (order_c == kRowMajor) { + Compare(c_.data().get(), c_ref_.data().get(), n_, n_, m_, 0); + } + else { + Compare(c_.data().get(), c_ref_.data().get(), m_, m_, n_, 0); + } + } + + int64_t global_memory_reads() + { + return get_size(a_pack_desc_) + get_size(b_pack_desc_) + get_size(u_pack_desc_) + get_size(v_pack_desc_); + } + + int64_t ref_global_memory_reads() + { + return get_size(a_desc_) + get_size(b_desc_); + } + +private: + int m_{}; + int n_{}; + int k_{}; + int g_{}; + + universal_vector a_; // A in fp + universal_vector b_; // B in fp + universal_vector c_ref_; // reference C + universal_vector c_; // buffer for C + + // shared with `*_f_` variants + MatrixLayout a_desc_; + MatrixLayout b_desc_; + MatrixLayout c_desc_; + + universal_vector a_q_; // quantized a + universal_vector b_q_; // quantized B + universal_vector u_; // quant param of `a_q_` + universal_vector v_; // quant param of `b_q_` + + // descs for converting to packed format + MatrixLayout a_q_desc_; + MatrixLayout b_q_desc_; + MatrixLayout u_desc_; + MatrixLayout v_desc_; + + universal_vector a_f_; // dequant `a_q_` back to fp + universal_vector b_f_; // dequant `b_q_` back to fp + universal_vector c_f_; // ref C computed by `b_f_` + + static constexpr int kVecSize = 8; + + universal_vector> a_pack_; // packed A + universal_vector> b_pack_; // packed B + + universal_vector u_pack_; // packed U + universal_vector v_pack_; // packed V + + MatrixLayout 
a_pack_desc_; + MatrixLayout b_pack_desc_; + MatrixLayout u_pack_desc_; + MatrixLayout v_pack_desc_; + + QuantDesc quant_a_{}; + QuantDesc quant_b_{}; + + universal_vector barriers_; + universal_vector partials_; + + cudaStream_t stream_; + + RNG rng_; + + Gemm gemm_; + Reference reference_; + DispatchPolicy dispatch_policy_; + std::string cache_path_; +}; + +template +T& gTestbed() +{ + static auto policy = [&] { + const auto str = std::getenv("TM_GEMM_TEST_POLICY"); + auto policy = turbomind::gemm::DispatchPolicy::kDefault; + using namespace turbomind::gemm; + if (str) { + using namespace std::string_view_literals; + if (str == "measure"sv) { + policy = DispatchPolicy::kMeasure; + } + else if (str == "reuse"sv) { + policy = DispatchPolicy::kReuse; + } + else if (str == "append"sv) { + policy = DispatchPolicy::kAppend; + } + else { + std::cerr << "unrecognized policy: " << std::quoted(str) << ", default policy will be used.\n"; + } + } + return policy; + }(); + + static T inst{policy, "tm_cache"}; + return inst; +} + +inline decltype(auto) get_test() +{ + if constexpr (0) { + // native + return gTestbed>(); + } + else if constexpr (0) { + // sm80 / sm75 + constexpr Pack kPackA = HMMA_16816 | OPERAND_A | 2; + constexpr Pack kPackU = HMMA_16816 | OPERAND_U | 1; + return gTestbed>(); + } + else if constexpr (1) { + // sm80 / sm75 + constexpr Pack kPackB = HMMA_16816 | OPERAND_B | 2; + constexpr Pack kPackV = HMMA_16816 | OPERAND_V | 1; + return gTestbed>(); + } + else if constexpr (0) { + // sm70 + constexpr Pack kPackB = HMMA_884 | OPERAND_B | 1; + constexpr Pack kPackV = HMMA_884 | OPERAND_V | 1; + return gTestbed>(); + } + else if constexpr (0) { + // simt + constexpr Pack kPackB = HMMA_SIMT | OPERAND_B | 1; + constexpr Pack kPackV = HMMA_SIMT | OPERAND_V | 1; + return gTestbed>(); + } +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/thread_group_map.h b/src/turbomind/kernels/gemm/thread_group_map.h new file mode 100644 index 0000000000..47fc62d5de --- /dev/null +++ b/src/turbomind/kernels/gemm/thread_group_map.h @@ -0,0 +1,117 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
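+// NOTE: summary inferred from the definitions below. These maps describe how
+// the warp groups of a tiled MMA are distributed over the CTA tile: how many
+// groups there are along M/N/K, the per-group footprint, the stride between
+// consecutive iterations of one group (kDelta*) and the iteration counts
+// (kIter*). get_offset(group_id) turns a linear group id into the group's
+// (m, n, k) starting offset. MMA_Map composes an M/N arrangement (the
+// Blocked / Raked / mixed policies from thread_map.h) with a blocked or raked
+// split along K, selected by the rK flag.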
+ +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/thread_map.h" + +#include + +namespace turbomind::gemm { + +template +struct RakedThreadGroupMap { + static constexpr int M = M_; + static constexpr int N = N_; + static constexpr int K = K_; + + static constexpr int TileM = TM; + static constexpr int TileN = TN; + static constexpr int TileK = TK; + + static constexpr int kGroupM = GM; + static constexpr int kGroupN = GN; + static constexpr int kGroupK = GK; + + static constexpr int kGroupCount = GM * GN * GK; + + static constexpr int M1 = GM * TM; + static constexpr int N1 = GN * TN; + static constexpr int K1 = GK * TK; + + static constexpr int kIterM = M / M1; + static constexpr int kIterN = N / N1; + static constexpr int kIterK = K / K1; + + static constexpr int kFootprintM = kIterM * TM; + static constexpr int kFootprintN = kIterN * TN; + static constexpr int kFootprintK = kIterK * TK; + + static constexpr int kDeltaM = TM; + static constexpr int kDeltaN = TN; + static constexpr int kDeltaK = TK; + + __device__ static int3 get_offset(int group_id) + { + const int m = group_id % GM; + const int n = group_id / GM % GN; + const int k = group_id / GM / GN; + return {m * kFootprintM, n * kFootprintN, k * kFootprintK}; + } +}; + +template +struct MMA_Map { + static constexpr int M = M_; + static constexpr int N = N_; + static constexpr int K = K_; + + static constexpr int TileM = tM_; + static constexpr int TileN = tN_; + static constexpr int TileK = tK_; + + static constexpr int kGroupM = ArrangementMN::gM; + static constexpr int kGroupN = ArrangementMN::gN; + static constexpr int kGroupK = gK; + + static constexpr int kGroupCount = kGroupM * kGroupN * kGroupK; + + static constexpr int kIterM = M / tM_ / kGroupM; + static constexpr int kIterN = N / tN_ / kGroupN; + static constexpr int kIterK = K / tK_ / kGroupK; + + static constexpr int kFootprintM = kIterM * tM_; + static constexpr int kFootprintN = kIterN * tN_; + static constexpr int kFootprintK = kIterK * tK_; + + static constexpr int kDeltaM = tM_ * ArrangementMN::dM; + static constexpr int kDeltaN = tN_ * ArrangementMN::dN; + static constexpr int kDeltaK = tK_ * (rK ? gK : 1); + + static constexpr auto kPartitionM = ArrangementMN::pM; + static constexpr auto kPartitionN = ArrangementMN::pN; + static constexpr auto kPartitionK = rK ? Partition::kRaked : Partition::kBlocked; + + __device__ static int3 get_offset(int group_id) + { + constexpr int kGroupMN = kGroupM * kGroupN; + + const auto mn = ArrangementMN::get_offset(group_id % kGroupMN, pair{}); + const int k = group_id / kGroupMN; + + return {mn.x * tM_, mn.y * tN_, k * tK_ * (rK ? 
1 : kIterK)}; + } +}; + +namespace { + +template +void Print_(TMap) +{ + std::cout << "M, N, K = " << TMap::M << " " << TMap::N << " " << TMap::K << "\n"; + std::cout << "TM, TN, TK = " << TMap::TileM << " " << TMap::TileN << " " << TMap::TileK << "\n"; + std::cout << "group count = " << TMap::kGroupCount << "\n"; + // std::cout << "M1, N1, K1 = " << TMap::M1 << " " << TMap::N1 << " " << TMap::K1 << "\n"; + std::cout << "itM, itN, itK = " << TMap::kIterM << " " << TMap::kIterN << " " << TMap::kIterK << "\n"; + std::cout << "fpM, fpN, fpK = " << TMap::kFootprintM << " " << TMap::kFootprintN << " " << TMap::kFootprintK + << "\n"; + std::cout << "dM, dN, dK = " << TMap::kDeltaM << " " << TMap::kDeltaN << " " << TMap::kDeltaK << "\n"; +} + +} // namespace + +/// TODO: Striped partition? + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/thread_map.h b/src/turbomind/kernels/gemm/thread_map.h new file mode 100644 index 0000000000..03f2828bb8 --- /dev/null +++ b/src/turbomind/kernels/gemm/thread_map.h @@ -0,0 +1,246 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/meta.h" + +#include "src/turbomind/kernels/gemm/types.h" + +#include + +namespace turbomind::gemm { + +template +struct ThreadMap { + static constexpr int kDimC = DimC; + static constexpr int kDimS = DimS; + + static constexpr int kWarpCount = WarpCount; + static constexpr int kAccessC = AccessC; + + static constexpr int kWarpThreadC = WarpThreadC; + static constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC; + + static_assert(kWarpThreadC <= WARP_SIZE); + + static constexpr int kWarpAccessC = kWarpThreadC * kAccessC; + static constexpr int kWarpAccessS = kWarpThreadS; + + static constexpr int kWarpIterC = ceil_div(kDimC, kWarpAccessC); + static constexpr int kWarpIterS = ceil_div(kDimS, kWarpAccessS); + + // Partition warps along the strided axis first to reduce strided iters + static constexpr int kWarpS = kWarpIterS >= kWarpCount ? kWarpCount : kWarpIterS; + static constexpr int kWarpC = kWarpCount > kWarpIterS ? 
kWarpCount / kWarpS : 1; + + static constexpr int kIterC = ceil_div(kWarpIterC, kWarpC); + static constexpr int kIterS = ceil_div(kWarpIterS, kWarpS); + + // Allow partial tile when there is ONLY 1 iteration + static_assert(kDimC % kWarpAccessC == 0 || kIterC == 1); + + // static_assert(kIterC > 0); + // static_assert(kIterS > 0); + + static constexpr bool kAlignedC = (kDimC % kWarpAccessC == 0) && (kWarpIterC % kWarpC == 0); + static constexpr bool kAlignedS = (kDimS % kWarpAccessS == 0) && (kWarpIterS % kWarpS == 0); + + static constexpr int kFootprintC = kWarpAccessC * kIterC; + static constexpr int kFootprintS = kWarpAccessS * kIterS; + + static constexpr int kDeltaC = kWarpAccessC; + static constexpr int kDeltaS = kWarpAccessS; + + // static constexpr int kDeltaC = kWarpAccessC * kWarpC; + // static constexpr int kDeltaS = kWarpAccessS * kWarpS; + + __device__ static int2 get_offset(int warp_id, int lane_id) + { + int warp_offset_c = warp_id % kWarpC; + int warp_offset_s = warp_id / kWarpC; + + int warp_thread_offset_c = lane_id % kWarpThreadC; + int warp_thread_offset_s = lane_id / kWarpThreadC; + + int cta_thread_offset_c = kFootprintC * warp_offset_c + warp_thread_offset_c * kAccessC; + int cta_thread_offset_s = kFootprintS * warp_offset_s + warp_thread_offset_s; + + // int cta_thread_offset_c = kWarpAccessC * warp_offset_c + warp_thread_offset_c * kAccessC; + // int cta_thread_offset_s = kWarpAccessS * warp_offset_s + warp_thread_offset_s; + + return {cta_thread_offset_c, cta_thread_offset_s}; + } +}; + +template +__host__ __device__ static constexpr int2 idx2mk(int idx, pair) +{ + if constexpr (order == kColMajor) { + return {idx % M, idx / M}; + } + else { + return {idx / K, idx % K}; + } +} + +enum class Partition +{ + kBlocked, + kRaked, +}; + +template +struct Blocked { + static constexpr int gM = gM_; + static constexpr int gN = gN_; + + // static_assert((gM - 1) * sM + (gN - 1) * sN == gM * gN - 1); + + static constexpr int dM = 1; + static constexpr int dN = 1; + + static constexpr Partition pM = Partition::kBlocked; + static constexpr Partition pN = Partition::kBlocked; + + template + __device__ static int2 get_offset(int idx, pair) + { + constexpr int iM = ceil_div(M, gM); + constexpr int iN = ceil_div(N, gN); + + // const int mi = idx / sM % gM; + // const int ni = idx / sN % gN; + + const int2 mn = idx2mk(idx, pair{}); + return {mn.x * iM, mn.y * iN}; + } +}; + +template +struct Raked { + static constexpr int gM = gM_; + static constexpr int gN = gN_; + + // static_assert((gM - 1) * sM + (gN - 1) * sN == gM * gN - 1); + + static constexpr int dM = gM; + static constexpr int dN = gN; + + static constexpr Partition pM = Partition::kRaked; + static constexpr Partition pN = Partition::kRaked; + + template + __device__ static int2 get_offset(int idx, Shape) + { + return idx2mk(idx, pair{}); + } +}; + +template +struct Blocked_C_Raked_S { + static constexpr int gM = gM_; + static constexpr int gN = gN_; + + static constexpr int dM = 1; + static constexpr int dN = gN; + + static constexpr Partition pM = Partition::kBlocked; + static constexpr Partition pN = Partition::kRaked; + + template + __device__ static int2 get_offset(int idx, pair) + { + constexpr int iM = ceil_div(M, gM); + + const int2 mn = idx2mk(idx, pair{}); + return {mn.x * iM, mn.y}; + } +}; + +template + typename Arrangement_, + int WarpCount, + int WarpThrC = std::min(WARP_SIZE, C / AccessC)> +struct ThreadMap_V2 { + static constexpr int kDimC = C; + static constexpr int kDimS = S; + + static constexpr int 
kWarpCount = WarpCount; + static constexpr int kAccessC = AccessC; + + static_assert(WarpThrC <= WARP_SIZE); + + static constexpr int kWarpThreadC = WarpThrC; + static constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC; + + static constexpr int kWarpAccessC = kWarpThreadC * kAccessC; + static constexpr int kWarpAccessS = kWarpThreadS; + + static constexpr int kWarpIterC = ceil_div(kDimC, kWarpAccessC); + static constexpr int kWarpIterS = ceil_div(kDimS, kWarpAccessS); + + static constexpr int kWarpS = kWarpIterS >= kWarpCount ? kWarpCount : kWarpIterS; + static constexpr int kWarpC = kWarpCount > kWarpIterS ? kWarpCount / kWarpS : 1; + + using Arrangement = Arrangement_; + + static constexpr auto kPartitionM = Arrangement::pM; + static constexpr auto kPartitionN = Arrangement::pN; + + static constexpr int kIterC = ceil_div(kWarpIterC, kWarpC); + static constexpr int kIterS = ceil_div(kWarpIterS, kWarpS); + + static constexpr bool kAlignedC = (kDimC % kWarpAccessC == 0) && (kWarpIterC % kWarpC == 0); + static constexpr bool kAlignedS = (kDimS % kWarpAccessS == 0) && (kWarpIterS % kWarpS == 0); + + static constexpr int kFootprintC = kWarpAccessC * kIterC; + static constexpr int kFootprintS = kWarpAccessS * kIterS; + + static constexpr int kDeltaC = kWarpAccessC * Arrangement::dM; + static constexpr int kDeltaS = kWarpAccessS * Arrangement::dN; + + __device__ static int2 get_offset(int warp_id, int lane_id) + { + const int2 warp_offset = Arrangement::get_offset(warp_id, pair{}); + + int warp_thr_offset_c = lane_id % kWarpThreadC; + int warp_thr_offset_s = lane_id / kWarpThreadC; + + if constexpr (kWarpThreadC == WARP_SIZE) { + warp_thr_offset_c = lane_id; + warp_thr_offset_s = 0; + } + + const int offset_c = warp_offset.x * kWarpAccessC + warp_thr_offset_c * kAccessC; + const int offset_s = warp_offset.y * kWarpAccessS + warp_thr_offset_s; + + return {offset_c, offset_s}; + } +}; + +namespace { + +template +void Print(TMap) +{ + std::cout << " warps: " << TMap::kWarpCount << "\n"; + std::cout << " shape: (" << TMap::kDimC << ", " << TMap::kDimS << ")\n"; + std::cout << " access: (" << TMap::kAccessC << ", " << 1 << ")\n"; + std::cout << "warpThread: (" << TMap::kWarpThreadC << ", " << TMap::kWarpThreadS << ")\n"; + std::cout << "warpAccess: (" << TMap::kWarpAccessC << ", " << TMap::kWarpAccessS << ")\n"; + std::cout << " warpIter: (" << TMap::kWarpIterC << ", " << TMap::kWarpIterS << ")\n"; + std::cout << " warp: (" << TMap::kWarpC << ", " << TMap::kWarpS << ")\n"; + std::cout << " iter: (" << TMap::kIterC << ", " << TMap::kIterS << ")\n"; + std::cout << " footprint: (" << TMap::kFootprintC << ", " << TMap::kFootprintS << ")\n"; + std::cout << " delta: (" << TMap::kDeltaC << ", " << TMap::kDeltaS << ")\n"; + std::cout << " aligned: (" << TMap::kAlignedC << "," << TMap::kAlignedS << ")\n"; +} + +} // namespace + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tiled_mma.h b/src/turbomind/kernels/gemm/tiled_mma.h new file mode 100644 index 0000000000..7d131ce318 --- /dev/null +++ b/src/turbomind/kernels/gemm/tiled_mma.h @@ -0,0 +1,209 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
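The thread-map arithmetic above is easiest to follow with concrete numbers. The sketch below re-derives the constants on the host for an assumed 64x16 (C x S) tile, 8-element vector accesses, 4 warps and 8 threads along C within a warp; these numbers and the local `ceil_div` helper are illustrative only, not values taken from the kernels in this patch.

#include <cstdio>

constexpr int WARP_SIZE = 32;
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

// Assumed example: 64x16 tile (C x S), 8-element accesses, 4 warps, 8 threads per row.
constexpr int kDimC = 64, kDimS = 16, kAccessC = 8, kWarpCount = 4, kWarpThreadC = 8;

constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC;                      // 4
constexpr int kWarpAccessC = kWarpThreadC * kAccessC;                       // 64
constexpr int kWarpAccessS = kWarpThreadS;                                  // 4
constexpr int kWarpIterC   = ceil_div(kDimC, kWarpAccessC);                 // 1
constexpr int kWarpIterS   = ceil_div(kDimS, kWarpAccessS);                 // 4
constexpr int kWarpS = kWarpIterS >= kWarpCount ? kWarpCount : kWarpIterS;  // 4
constexpr int kWarpC = kWarpCount > kWarpIterS ? kWarpCount / kWarpS : 1;   // 1
constexpr int kIterC = ceil_div(kWarpIterC, kWarpC);                        // 1
constexpr int kIterS = ceil_div(kWarpIterS, kWarpS);                        // 1

static_assert(kWarpS == 4 && kWarpC == 1, "warps are stacked along the strided axis first");
static_assert(kIterC == 1 && kIterS == 1, "each thread issues a single 8-element access");

int main()
{
    // Thread (warp_id=1, lane_id=9) -> offsets, mirroring ThreadMap::get_offset.
    int warp_id = 1, lane_id = 9;
    int c = (kWarpAccessC * kIterC) * (warp_id % kWarpC) + (lane_id % kWarpThreadC) * kAccessC;  // 8
    int s = (kWarpAccessS * kIterS) * (warp_id / kWarpC) + (lane_id / kWarpThreadC);             // 5
    std::printf("offset = (%d, %d)\n", c, s);
    return 0;
}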
+ +#pragma once + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/core/mma.h" +#include "src/turbomind/kernels/core/smem.h" +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/simt.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/thread_map.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gemm/utils.h" + +namespace turbomind::gemm { + +template +struct Tiled_MMA_v2 { + using Atom = MMA_Atom_; + using Map = MMA_Map_; + + static constexpr int M = Map::M; + static constexpr int N = Map::N; + static constexpr int K = Map::K; + + static constexpr int kGroupCount = Map::kGroupCount; + static constexpr int kThreadCount = kGroupCount * Atom::kThreadCount; + + static constexpr int kTileIterM = Map::kIterM; + static constexpr int kTileIterN = Map::kIterN; + static constexpr int kTileIterK = Map::kIterK; + + static constexpr int kDeltaM = Map::kDeltaM; + static constexpr int kDeltaN = Map::kDeltaN; + static constexpr int kDeltaK = Map::kDeltaK; + + static constexpr int kAtomM = Map::TileM / Atom::M; + static constexpr int kAtomN = Map::TileN / Atom::N; + static constexpr int kAtomK = Map::TileK / Atom::K; + + static constexpr int kMmaIterM = kTileIterM * kAtomM; + static constexpr int kMmaIterN = kTileIterN * kAtomN; + static constexpr int kMmaIterK = kTileIterK * kAtomK; + + __device__ static int3 get_offset(int thread_idx) + { + return Map::get_offset(Atom::get_group_id(thread_idx)); + } +}; + +template +struct Rearrange { + using Map = typename MMA::Map; + using Atom = typename MMA::Atom; + + template + __device__ static void + apply(Array (&frag_C)[M][N], SmemAccessorV2& smem_C, int2 offset_mn, pair) + { + const int3 offset_mnk = MMA::get_offset(threadIdx.x); + const int group_id_k = offset_mnk.z / Map::kFootprintK; + + constexpr bool kRakedM = Map::kPartitionM == Partition::kRaked; + constexpr bool kRakedN = Map::kPartitionN == Partition::kRaked; + + static constexpr int2 kMN0 = cs2mk(Layout::C0, Layout::S0); + + constexpr int kPeriodM = ceil_div(kMN0.x, Map::kDeltaM); + constexpr int kPeriodN = ceil_div(kMN0.y, Map::kDeltaN); + constexpr int kPeriodM1 = ceil_div(kMN0.x, Atom::M); + constexpr int kPeriodN1 = ceil_div(kMN0.y, Atom::N); + + constexpr auto offset_C = Atom::static_offset_C(); + const int2 thr = Atom::thread_offset_C(); + + // Contract: All these indices is not a part of swizzling + int phases[kPeriodM][kPeriodN][kPeriodM1][kPeriodN1][offset_C.size()]; + PRAGMA_UNROLL + for (int m = 0; m < kPeriodM; ++m) { + PRAGMA_UNROLL + for (int n = 0; n < kPeriodN; ++n) { + PRAGMA_UNROLL + for (int m1 = 0; m1 < kPeriodM1; ++m1) { + PRAGMA_UNROLL + for (int n1 = 0; n1 < kPeriodN1; ++n1) { + const int mm = offset_mnk.x + m * Map::kDeltaM + m1 * Atom::M + thr.x; + const int nn = offset_mnk.y + n * Map::kDeltaN + n1 * Atom::N + thr.y; + PRAGMA_UNROLL + for (int i = 0; i < offset_C.size(); ++i) { + const int2 cs = mk2cs(mm + offset_C[i].x, nn + offset_C[i].y); + phases[m][n][m1][n1][i] = Layout::apply(cs.y, cs.x); + } + } + } + } + } + + constexpr int K = Map::kGroupK; + constexpr int C = offset_C.size(); + + int offsets[K][M][N][C]; + int masks[K][M][N][C]; + + PRAGMA_UNROLL + for (int k = 0; k < K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < M; ++m) { + PRAGMA_UNROLL + for (int n = 0; n < N; ++n) { + int m0 = m / MMA::kAtomM, m1 = m % MMA::kAtomM, n0 = n / 
MMA::kAtomN, n1 = n % MMA::kAtomN; + int m01 = + m0 / kPeriodM * kPeriodM * Map::kDeltaM + m1 / kPeriodM1 * kPeriodM1 * Atom::M - offset_mn.x; + int n01 = + n0 / kPeriodN * kPeriodN * Map::kDeltaN + n1 / kPeriodN1 * kPeriodN1 * Atom::N - offset_mn.y; + const int2 cs = mk2cs(m01, n01); + int offset_0 = Layout::apply(cs.y, cs.x); + PRAGMA_UNROLL + for (int i = 0; i < offset_C.size(); ++i) { + int offset_1 = phases[m0 % kPeriodM][n0 % kPeriodN][m1 % kPeriodM1][n1 % kPeriodN1][i]; + offsets[k][m][n][i] = offset_0 + offset_1; + const int bm = offset_mnk.x - offset_mn.x + m0 * Map::kDeltaM + m1 * Atom::M + thr.x; + const int bn = offset_mnk.y - offset_mn.y + n0 * Map::kDeltaN + n1 * Atom::N + thr.y; + const int mm = kRakedM ? m01 : bm; + const int nn = kRakedN ? n01 : bn; + masks[k][m][n][i] = (Map::kGroupK == 1 || group_id_k == k) + && (TM >= Map::M || (0 <= mm && mm < TM)) + && (TN >= Map::N || (0 <= nn && nn < TN)); + } + } + } + } + + auto _store = [](auto ptr, auto offset, auto vec) { + if constexpr (order == kRowMajor) { + Store(&ptr[offset], vec); + } + else { + for (int i = 0; i < vec.size(); ++i) { + ptr[offset + Layout::apply(i, 0)] = vec[i]; + } + } + }; + + typename Atom::FragC_ reshape_C; + + auto ptr = &smem_C(0, 0); + + PRAGMA_UNROLL + for (int m = 0; m < M; ++m) { + PRAGMA_UNROLL + for (int n = 0; n < N; ++n) { + Atom::ReshapeC(frag_C[m][n], reshape_C); + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + auto& vec = reshape_C[c]; + int offset = offsets[0][m][n][c]; + if (masks[0][m][n][c]) { + _store(ptr, offset, vec); + } + } + } + } + + __syncthreads(); + +#if 1 + auto _load = [](auto ptr, auto offset, auto& vec) { + if constexpr (order == kRowMajor) { + Load(vec, &ptr[offset]); + } + else { + for (int i = 0; i < vec.size(); ++i) { + vec[i] = ptr[offset + Layout::apply(i, 0)]; + } + } + }; + + PRAGMA_UNROLL + for (int k = 1; k < K; ++k) { + PRAGMA_UNROLL + for (int m = 0; m < M; ++m) { + PRAGMA_UNROLL + for (int n = 0; n < N; ++n) { + Atom::ReshapeC(frag_C[m][n], reshape_C); + PRAGMA_UNROLL + for (int c = 0; c < C; ++c) { + auto& vec = reshape_C[c]; + int offset = offsets[k][m][n][c]; + if (masks[k][m][n][c]) { + std::remove_reference_t tmp; + _load(ptr, offset, tmp); + { + using namespace ops; + vec = vec + tmp; + } + _store(ptr, offset, vec); + } + } + } + } + __syncthreads(); + } +#endif + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/transform.h b/src/turbomind/kernels/gemm/transform.h new file mode 100644 index 0000000000..bcb059152f --- /dev/null +++ b/src/turbomind/kernels/gemm/transform.h @@ -0,0 +1,120 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
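A small compile-time sketch of the tiled-MMA composition above. The m16n8k16 atom with 32 threads, the 64x32x32 per-group tile, 4 groups and 2x4x1 tile iterations are assumed example values, not a configuration read from this patch; only the relations between them mirror Tiled_MMA_v2.

// Illustrative arithmetic only; the MMA atom / map shapes are assumptions.
constexpr int AtomM = 16, AtomN = 8, AtomK = 16, AtomThreads = 32;
constexpr int TileM = 64, TileN = 32, TileK = 32;
constexpr int GroupCount = 4;
constexpr int TileIterM = 2, TileIterN = 4, TileIterK = 1;

constexpr int kThreadCount = GroupCount * AtomThreads;  // 128 threads per CTA
constexpr int kAtomM = TileM / AtomM;                   // 4 atoms along M per tile
constexpr int kAtomN = TileN / AtomN;                   // 4 atoms along N per tile
constexpr int kAtomK = TileK / AtomK;                   // 2 atoms along K per tile

// Per-thread MMA issue counts: tile iterations times atoms per tile.
constexpr int kMmaIterM = TileIterM * kAtomM;           // 8
constexpr int kMmaIterN = TileIterN * kAtomN;           // 16
constexpr int kMmaIterK = TileIterK * kAtomK;           // 2

static_assert(kThreadCount == 128, "4 groups of one 32-thread atom each");
static_assert(kMmaIterM * kMmaIterN * kMmaIterK == 256, "MMA instructions issued per thread");

int main() { return 0; }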
+ +#pragma once + +#include "src/turbomind/kernels/attention/quantization.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/meta.h" +#include "src/turbomind/kernels/gemm/smem_copy.h" +#include "src/turbomind/kernels/gemm/tiled_mma.h" +#include + +namespace turbomind::gemm { + +struct Transform_Default { + template + __device__ static void apply(Array (&frag)[K][Mf], int k, Array (&data)[K][Md], S&, int div) + { + static_assert(Nf * Mf == Nd * Md); + static_assert(Nd % Nf == 0 && Mf % Md == 0); + static_assert(sizeof(frag) == sizeof(data)); + + // Alignment must be manually enforced for `reinterpret_cast` + auto& frag_k = reinterpret_cast(&)[Md]>(frag[k]); + auto& data_k = data[k]; + + PRAGMA_UNROLL + for (int i = 0; i < std::size(frag_k); ++i) { + frag_k[i] = data_k[i]; + } + } +}; + +template +struct Transform_HMMA_16816 { + template + __device__ static void + apply(Array (&frag)[K][Mf], int k, Array (&data)[K][Md], Array (&stat)[Ks][Ms], int div) + { + static_assert(Nf * Mf == Nd * Md); + static_assert(Nd % Nf == 0 && Mf % Md == 0); + static_assert(Nf * Mf == Ns * Ms * 4); + + // static_assert(Nf != Nf); + + auto& frag_k = reinterpret_cast(&)[Md]>(frag[k]); + auto& stat_k = reinterpret_cast(&)[Ns * Ms]>(stat[k / div]); + auto& data_k = data[k]; + + PRAGMA_UNROLL + for (int m = 0; m < Md; ++m) { + // if (threadIdx.x == 0) { + // printf("m = %d\n", m); + // } + auto tmp = ConvertKvCache::convert(data_k[m]); + PRAGMA_UNROLL + for (int i = 0; i < Nd; i += 8) { + PRAGMA_UNROLL + for (int s = 0; s < 2; ++s) { + PRAGMA_UNROLL + for (int c = 0; c < 2; ++c) { + const int idx = (m * Nd + i) / 8 * 2 + s * StatStepS + c * StatStepC; + // if (threadIdx.x == 0) { + // printf("idx=%d\n", idx); + // } + dequant((Array&)tmp[i + s * 4 + c * 2], stat_k[idx]); + } + } + } + + frag_k[m] = tmp; + } + } + + template + __device__ static void dequant(Array& x, Array s) + { + Array& _s = (Array&)s; + // printf("tidx=%d %f %f\n", (int)threadIdx.x, (float)_s[0], (float)_s[1]); + // printf("tidx=%d %f %f\n", (int)threadIdx.x, (float)x[0], (float)x[1]); + x[0] = __hfma(x[0], _s[0], _s[1]); + x[1] = __hfma(x[1], _s[0], _s[1]); + } +}; + +struct Transform_HMMA_SIMT_B { + template + __device__ static void + apply(Array (&frag)[K][Mf], int k, Array (&data)[K][Md], Array (&stat)[Ks][Ms], int div) + { + static_assert(Nf * Mf == Nd * Md); + static_assert(Nd % Nf == 0 && Mf % Md == 0); + + auto& frag_k = reinterpret_cast(&)[Md]>(frag[k]); + auto& stat_k = reinterpret_cast(&)[Ns * Ms]>(stat[k / div]); + auto& data_k = data[k]; + + // static_assert(Nf != Nf); + + PRAGMA_UNROLL + for (int m = 0; m < Md; ++m) { + auto tmp = ConvertKvCache::convert(data_k[m]); + PRAGMA_UNROLL + for (int i = 0; i < Nd; i += 2) { + dequant((Array&)tmp[i], stat_k[(m * Nd + i) / Nf]); + } + frag_k[m] = tmp; + } + } + + template + __device__ static void dequant(Array& x, Array s) + { + Array& _s = (Array&)s; + + x[0] = __hfma(x[0], _s[0], _s[1]); + x[1] = __hfma(x[1], _s[0], _s[1]); + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/cache_utils.cu b/src/turbomind/kernels/gemm/tuner/cache_utils.cu new file mode 100644 index 0000000000..6d88139715 --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/cache_utils.cu @@ -0,0 +1,28 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
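The `__hfma`-based `dequant` above applies `x * scale + bias` per element once the packed values have been widened to half. Below is a host-side reference of that mapping, assuming u4 values packed eight per `uint32_t` and one `(scale, bias)` pair per quantization group; the names and layout here are illustrative, with `bias` standing in for a pre-combined zero-point term.

#include <cstdint>
#include <vector>

struct GroupStat {
    float scale;
    float bias;  // assumed pre-folded zero-point term stored next to the scale
};

// Dequantize n unsigned 4-bit values packed 8 per uint32_t, group_size values per stat entry.
std::vector<float> dequant_u4_reference(const std::vector<uint32_t>& packed,
                                        const std::vector<GroupStat>& stats,
                                        int n, int group_size)
{
    std::vector<float> out(n);
    for (int i = 0; i < n; ++i) {
        uint32_t q = (packed[i / 8] >> (4 * (i % 8))) & 0xF;  // unpack one nibble
        const GroupStat& s = stats[i / group_size];           // per-group statistics
        out[i] = float(q) * s.scale + s.bias;                 // matches __hfma(x, scale, bias)
    }
    return out;
}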
+ +#include "src/turbomind/kernels/gemm/tuner/cache_utils.h" + +namespace turbomind::gemm { + +CacheFlushing::CacheFlushing() +{ + cudaDeviceProp props{}; + cudaGetDeviceProperties(&props, 0); + + size_ = props.l2CacheSize; + + cudaMalloc(&buffer_, size_); +} + +void CacheFlushing::flush(cudaStream_t stream) +{ + thread_local CacheFlushing inst{}; + inst(stream); +} + +void CacheFlushing::operator()(cudaStream_t stream) const +{ + cudaMemsetAsync(buffer_, 0, size_, stream); +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/cache_utils.h b/src/turbomind/kernels/gemm/tuner/cache_utils.h new file mode 100644 index 0000000000..d9ce66d51a --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/cache_utils.h @@ -0,0 +1,21 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include + +namespace turbomind::gemm { + +class CacheFlushing { +public: + static void flush(cudaStream_t stream = {}); + +private: + CacheFlushing(); + void operator()(cudaStream_t stream) const; + + uint32_t* buffer_; + size_t size_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/measurer.cu b/src/turbomind/kernels/gemm/tuner/measurer.cu new file mode 100644 index 0000000000..a33d78f6f7 --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/measurer.cu @@ -0,0 +1,84 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/tuner/cache_utils.h" +#include "src/turbomind/kernels/gemm/tuner/measurer.h" +#include + +namespace turbomind::gemm { + +Measurer::Measurer(std::unique_ptr stop_criterion): stop_criterion_{std::move(stop_criterion)} +{ + cudaEventCreate(&ev_beg_); + cudaEventCreate(&ev_end_); +} + +Measurer::~Measurer() +{ + cudaEventDestroy(ev_beg_); + cudaEventDestroy(ev_end_); + ev_beg_ = ev_end_ = {}; +} + +std::vector +Measurer::Measure(const std::vector& specs, const Launcher& launcher, cudaStream_t stream) +{ + std::vector m; + m.reserve(specs.size()); + for (const auto& spec : specs) { + auto measure = MeasureOne(spec, launcher, stream); + if (measure.sample_count) { + m.push_back(measure); + } + /// TODO: report error + } + return m; +} + +Measurement Measurer::MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream) +{ + Stats stats{}; + cudaError_t status = cudaSuccess; + while (true) { + float ms{}; + std::tie(ms, status) = ColdRun(spec, launcher, stream); + if (status != cudaSuccess) { + break; + } + stats.add_sample(ms); + // std::cout << spec.kernel->name() << " " << spec.swizzle << " " << stats.count() << " " << stats.mean() << " " + // << stats.get_variance() << "\n"; + if (stop_criterion_->should_stop(stats)) { + break; + } + } + return Measurement{ + status, + stats.count(), + stats.mean(), + stats.get_variance(), + }; +} + +std::pair Measurer::ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream) +{ + CacheFlushing::flush(stream); + + cudaEventRecord(ev_beg_, stream); + + launcher(spec, stream); + + cudaEventRecord(ev_end_, stream); + cudaEventSynchronize(ev_end_); + + const auto status = cudaGetLastError(); + float ms{}; + + if (status == cudaSuccess) { + cudaEventElapsedTime(&ms, ev_beg_, ev_end_); + } + + return {ms, status}; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/measurer.h b/src/turbomind/kernels/gemm/tuner/measurer.h new file mode 100644 index 0000000000..ec47792208 --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/measurer.h @@ -0,0 +1,41 @@ +// 
Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h" +#include +#include +#include +#include + +namespace turbomind::gemm { + +struct Measurement { + cudaError_t status; + int sample_count; + float mean; + float variance; +}; + +using Launcher = std::function; + +class Measurer { +public: + Measurer(std::unique_ptr stop_criterion); + + ~Measurer(); + + std::vector + Measure(const std::vector& specs, const Launcher& launcher, cudaStream_t stream); + +private: + Measurement MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream); + + std::pair ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream); + +private: + cudaEvent_t ev_beg_; + cudaEvent_t ev_end_; + std::unique_ptr stop_criterion_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/params.cc b/src/turbomind/kernels/gemm/tuner/params.cc new file mode 100644 index 0000000000..8d7a862e37 --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/params.cc @@ -0,0 +1,104 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/tuner/params.h" +#include "src/turbomind/utils/parser.h" +#include +#include +#include + +namespace turbomind::gemm { + +void ParseTuningParams(TuningParams& params, const std::string& str) +{ + const auto list = ParseArgsList(str); + + auto try_parse = [&](auto& value, auto name) { + auto it = std::find_if(list.begin(), list.end(), [&](auto a) { return a.first == name; }); + if (it != list.end()) { + std::cout << name << " " << it->second << "\n"; + Parse(value, it->second); + } + }; + + try_parse(params.max_splits, "max_splits"); + try_parse(params.max_waves, "max_waves"); + try_parse(params.swizzle, "swizzle"); + try_parse(params.top_k, "top_k"); + try_parse(params.clusters, "clusters"); + try_parse(params.min_iter, "min_iter"); + try_parse(params.max_iter, "max_iter"); + try_parse(params.max_time, "max_time"); + + if (auto it = std::find_if(list.begin(), list.end(), [&](auto a) { return a.first == "seq"; }); it != list.end()) { + params.seq = ParseTuningSequence(it->second); + } +} + +std::vector ParseTuningSequence(const std::string& str) +{ + const std::regex triplet(R"((\d+)-(\d+)-(\d+))"); + + std::vector> generators; + + const auto tokens = ParseListOrTuple(str); + + for (const auto& token : tokens) { + std::smatch match; + if (std::regex_match(token, match, triplet)) { + generators.push_back({std::stoi(match[1].str()), // + std::stoi(match[2].str()), + std::stoi(match[3].str())}); + } + else { // must be an integer string + generators.push_back({std::stoi(token), 0, 0}); + } + } + + if (generators.size() == 1) { // Replace sentinel of the default generators + auto fallback = GetDefaultTuningGenerators(); + fallback.back() = {generators.front().front(), 0, 0}; + generators = std::move(fallback); + } + + return GenerateTuningSequence(generators); +} + +std::vector GenerateTuningSequence(const std::vector>& generators) +{ + std::vector ret; + if (generators.empty()) { + return ret; + } + const int last = generators.back().front(); + // The last generator is a sentinel `(max_bs, 0, 0)` + for (int i = 0; i < (int)generators.size() - 1; ++i) { + auto [curr, next, step] = generators[i]; + if (curr >= last) { + break; + } + if (next == 0 && step == 0) { // single value + ret.push_back(curr); + } + else { // generator + const int end = std::min(generators[i + 1][0], last); + while (curr < end) { + 
ret.push_back(curr); + if (curr == next) { + step *= 2; + next *= 2; + } + curr += step; + } + } + } + ret.push_back(last); + return ret; +} + +std::vector> GetDefaultTuningGenerators() +{ + /// TODO: set generators based on device + return {{8, 16, 8}, {16, 64, 16}, {8192}}; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/params.h b/src/turbomind/kernels/gemm/tuner/params.h new file mode 100644 index 0000000000..fd3367431c --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/params.h @@ -0,0 +1,41 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include +#include +#include + +namespace turbomind::gemm { + +struct TuningParams { + // Split-k params + int max_splits = 8; + int max_waves = 10; + + // Swizzling params + std::vector swizzle{3}; + + // Sampling params + float top_k = 0; + int clusters = 5; + int min_iter = 1; + int max_iter = 10; + float max_time = 1.f; + + std::vector seq; +}; + +// example +// max_splits=8,top_splits=5,max_waves=16,top_k=10,swizzle=[2,3,4],clusters=5,max_iter=10,min_iter=1,max_time=10.0 +void ParseTuningParams(TuningParams& params, const std::string& str); + +// example +// 16-16-128,256-128-1024,8192 +std::vector ParseTuningSequence(const std::string& str); + +std::vector GenerateTuningSequence(const std::vector>& generators); + +std::vector> GetDefaultTuningGenerators(); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/sampler.cu b/src/turbomind/kernels/gemm/tuner/sampler.cu new file mode 100644 index 0000000000..20954f24ff --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/sampler.cu @@ -0,0 +1,79 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/tuner/sampler.h" +#include +#include +#include +#include + +namespace turbomind::gemm { + +template +static std::vector ArgSort(size_t size, const Cmp& cmp) +{ + std::vector idxs(size); + std::iota(idxs.begin(), idxs.end(), 0); + std::stable_sort(idxs.begin(), idxs.end(), cmp); + return idxs; +} + +std::vector Sampler::Run(std::vector specs, const Launcher& launcher, cudaStream_t stream) +{ + std::vector> clusters; // ptr into `specs` + if (k_clusters_) { + clusters = Cluster(specs, ClusteringParam{true, true}); + } + else { + for (auto& s : specs) { + clusters.push_back({s}); + } + } + // std::cout << "k_clusters=" << k_clusters_ << ", #specs" << specs.size() << ", #clusters" << clusters.size() << + // "\n"; + + std::vector s_1; + for (const auto& c : clusters) { + s_1.push_back(c.front()); + } + + auto m_1 = measurer_.Measure(s_1, launcher, stream); + + auto idxs = ArgSort(m_1.size(), [&](int i, int j) { return m_1[i].mean < m_1[j].mean; }); + + if (k_clusters_) { + const auto top_k = std::min(k_clusters_, (int)idxs.size()); + idxs.resize(top_k); + + std::vector s_2; + for (const auto& idx : idxs) { + auto& cluster = clusters[idx]; + // Skip cluster leader + for (size_t j = 1; j < cluster.size(); ++j) { + s_2.push_back(cluster[j]); + } + } + + // std::cout << "#s_2=" << s_2.size() << "\n"; + + auto m_2 = measurer_.Measure(s_2, launcher, stream); + // Merge measurements of the 2 runs + m_2.insert(m_2.end(), m_1.begin(), m_1.end()); + s_2.insert(s_2.end(), s_1.begin(), s_1.end()); + m_1.swap(m_2); + s_1.swap(s_2); + } + + idxs = ArgSort(m_1.size(), [&](int i, int j) { return m_1[i].mean < m_1[j].mean; }); + + std::vector ret; + for (const auto& i : idxs) { + s_1[i].measured = m_1[i].mean; 
+ ret.push_back(s_1[i]); + } + + return ret; +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/sampler.h b/src/turbomind/kernels/gemm/tuner/sampler.h new file mode 100644 index 0000000000..72fdbed690 --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/sampler.h @@ -0,0 +1,23 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/gemm/desc.h" +#include "src/turbomind/kernels/gemm/tuner/measurer.h" + +#include + +namespace turbomind::gemm { + +class Sampler { +public: + explicit Sampler(Measurer& measurer, int k_clusters): measurer_{measurer}, k_clusters_{k_clusters} {} + + std::vector Run(std::vector specs, const Launcher& launcher, cudaStream_t stream); + +private: + Measurer& measurer_; + int k_clusters_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/stats.h b/src/turbomind/kernels/gemm/tuner/stats.h new file mode 100644 index 0000000000..9925efcc94 --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/stats.h @@ -0,0 +1,46 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include + +namespace turbomind::gemm { + +class Stats { +public: + Stats(): count_{}, mean_{}, m2_{} {} + + float mean() const noexcept + { + return mean_; + } + + float sum() const noexcept + { + return mean_ * count_; + } + + int count() const noexcept + { + return count_; + } + + float get_variance() const noexcept + { + return count_ < 2 ? std::numeric_limits::quiet_NaN() : m2_ / count_; + } + + void add_sample(float x) noexcept + { + ++count_; + float delta = x - mean_; + mean_ += delta / count_; + float delta2 = x - mean_; + m2_ += delta * delta2; + } + +private: + int count_; + float mean_; + float m2_; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/stopping_criterion.cc b/src/turbomind/kernels/gemm/tuner/stopping_criterion.cc new file mode 100644 index 0000000000..5ac77547bd --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/stopping_criterion.cc @@ -0,0 +1,36 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h" +#include + +namespace turbomind::gemm { + +namespace stopping_criterions { + +class Optimistic: public StoppingCriterion { +public: + Optimistic(int min_iter, int max_iter, float max_ms) + { + min_iter_ = std::max(min_iter, 1); + max_iter_ = max_iter > 0 ? max_iter : std::numeric_limits::max(); + max_ms_ = max_ms > 0 ? max_ms : std::numeric_limits::infinity(); + } + bool should_stop(const Stats& stats) override + { + return stats.count() >= min_iter_ && (stats.count() >= max_iter_ || stats.sum() >= max_ms_); + } + +private: + int min_iter_; + int max_iter_; + float max_ms_; +}; + +} // namespace stopping_criterions + +std::unique_ptr CreateStoppingCriterion(int min_iter, int max_iter, float max_ms) +{ + return std::make_unique(min_iter, max_iter, max_ms); +} + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/tuner/stopping_criterion.h b/src/turbomind/kernels/gemm/tuner/stopping_criterion.h new file mode 100644 index 0000000000..efac3c39ca --- /dev/null +++ b/src/turbomind/kernels/gemm/tuner/stopping_criterion.h @@ -0,0 +1,16 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
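A minimal host-side sketch of how the pieces above interact: timing samples feed a Welford-style mean/variance accumulator (as in `Stats::add_sample`) and the loop exits when the `Optimistic` rule `count >= min_iter && (count >= max_iter || sum >= max_ms)` fires, mirroring the measurement loop. Thresholds and timings below are made up for illustration.

#include <cstdio>
#include <vector>

int main()
{
    const int   min_iter = 3, max_iter = 10;
    const float max_ms   = 2.0f;

    std::vector<float> timings = {0.45f, 0.52f, 0.48f, 0.50f, 0.47f, 0.49f};

    int   count = 0;
    float mean = 0.f, m2 = 0.f;
    for (float ms : timings) {
        ++count;                    // Stats::add_sample (Welford update)
        float delta = ms - mean;
        mean += delta / count;
        m2 += delta * (ms - mean);

        float sum = mean * count;   // Stats::sum()
        if (count >= min_iter && (count >= max_iter || sum >= max_ms)) {
            break;                  // Optimistic::should_stop
        }
    }
    std::printf("samples=%d mean=%.3f ms variance=%.5f\n",
                count, mean, count < 2 ? 0.f : m2 / count);
    return 0;
}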
+ +#include "src/turbomind/kernels/gemm/tuner/stats.h" +#include + +namespace turbomind::gemm { + +class StoppingCriterion { +public: + virtual ~StoppingCriterion() = default; + virtual bool should_stop(const Stats& stats) = 0; +}; + +std::unique_ptr CreateStoppingCriterion(int min_iter, int max_iter, float max_ms); + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/types.h b/src/turbomind/kernels/gemm/types.h new file mode 100644 index 0000000000..6821de0134 --- /dev/null +++ b/src/turbomind/kernels/gemm/types.h @@ -0,0 +1,240 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/kernels/core/data_type.h" +#include +#if ENABLE_BF16 +#include +#endif + +namespace turbomind::gemm { + +enum class Order : int +{ + kColMajor = 0, + kRowMajor = 1, +}; + +inline constexpr Order kColMajor = Order::kColMajor; +inline constexpr Order kRowMajor = Order::kRowMajor; + +constexpr Order operator~(Order a) +{ + return a == kColMajor ? kRowMajor : kColMajor; +} + +using Pack = uint32_t; + +typedef enum MMA_Tag +{ + HMMA_16816 = 0x100, // sm80+ + HMMA_1688 = 0x200, // sm75 + HMMA_884 = 0x300, // sm70 + HMMA_SIMT = 0x400, // sm75- +} MMA_Tag; + +typedef enum Op_Tag +{ + OPERAND_A = 0x010, + OPERAND_B = 0x020, + OPERAND_U = 0x030, + OPERAND_V = 0x040, +} Op_Tag; + +constexpr MMA_Tag get_mma_tag(Pack pack) +{ + return static_cast(pack & 0xf00); +} + +constexpr Op_Tag get_operand_tag(Pack pack) +{ + return static_cast(pack & 0x0f0); +} + +constexpr int get_pack_num(Pack pack) +{ + return pack & 0x00f; +} + +enum class QuantType : int +{ + kNone, + kDefault, +}; + +enum class Epilogue : int +{ + kNone = 0, + kChannelCombination = 0x1, + kGatedSilu = 0x2, +}; + +enum class DataType : int +{ + U4, + U8, + U16, + U32, + U64, + F8_E4M3, + F8_E5M2, + F16, + F32, + BF16, + TF32, +}; + +inline const char* to_string(DataType data_type) +{ + switch (data_type) { + case DataType::U4: + return "u4"; + case DataType::U8: + return "u8"; + case DataType::F16: + return "f16"; + case DataType::F32: + return "f32"; + case DataType::BF16: + return "bf16"; + case DataType::TF32: + return "tf32"; + default: + return "unknown"; + } +} + +inline int64_t get_size(DataType type, int64_t size) +{ + if (!size) { + return 0; + } + switch (type) { + case DataType::U64: + return size * 8; + case DataType::F32: + case DataType::U32: + return size * 4; + case DataType::BF16: + case DataType::F16: + case DataType::U16: + return size * 2; + case DataType::U8: + case DataType::F8_E4M3: + case DataType::F8_E5M2: + return size; + case DataType::U4: + return size / 2; + default: + // std::cerr << to_string(type) << "\n"; + return -1; + } +} + +template +struct get_data_type { +}; + +template<> +struct get_data_type { + static constexpr auto value = DataType::F16; +}; + +#if ENABLE_BF16 +template<> +struct get_data_type { + static constexpr auto value = DataType::BF16; +}; +#endif + +template<> +struct get_data_type { + static constexpr auto value = DataType::U4; +}; + +template<> +struct get_data_type { + static constexpr auto value = DataType::U8; +}; + +template +inline constexpr auto get_data_type_v = get_data_type::value; + +template +struct get_dtype { +}; + +template<> +struct get_dtype { + using type = half; +}; + +template<> +struct get_dtype { + using type = uint4_t; +}; + +template<> +struct get_dtype { + using type = uint8_t; +}; + +template<> +struct get_dtype { + using type = uint16_t; +}; + +template<> +struct get_dtype { + using type = uint32_t; +}; + +struct QuantDesc 
{ + QuantType type; + int group_size; +}; + +enum class DispatchPolicy : int +{ + kDefault = 0, + kMeasure = 1, + kReuse = 2, + kAppend = 3, +}; + +constexpr bool operator&(const DispatchPolicy& a, const DispatchPolicy& b) +{ + return ((int)a & (int)b); +} + +struct Operation { + DispatchPolicy dispatch; + Epilogue epilogue; + QuantDesc quant_a; + QuantDesc quant_b; + int batch_dim; +}; + +struct MatrixLayout { + DataType type; + Order order; + int rows; + int cols; + int ld; + Pack pack; +}; + +inline int64_t get_size(const MatrixLayout& m) +{ + return get_size(m.type, (int64_t)m.rows * m.cols); +} + +struct Workspace { + void* barriers; + size_t barriers_size; + void* partials; + size_t partials_size; +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm/unpack.cu b/src/turbomind/kernels/gemm/unpack.cu new file mode 100644 index 0000000000..92f468d82b --- /dev/null +++ b/src/turbomind/kernels/gemm/unpack.cu @@ -0,0 +1,87 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/kernels/core/data_type.h" +#include + +namespace turbomind { + +namespace { + +__device__ void atomic_assign_u4(uint32_t* address, uint32_t index, uint32_t value) +{ + uint32_t old = *address; + uint32_t assumed; + do { + assumed = old; + uint32_t tmp = (assumed & ~(0xfu << (index * 4u))) | (value << (index * 4u)); + old = atomicCAS(address, assumed, tmp); + } while (assumed != old); +} + +__device__ uint32_t read_u4(const uint32_t* address, uint32_t index) +{ + return (*address >> (index * 4u)) & 0xfu; +} + +template +__global__ void permute_u4(uint* dst, const uint* src, Array dims) +{ + constexpr int N = sizeof...(Ds); + + size_t count = 1; + PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + count *= dims[i]; + } + + constexpr int order[] = {Ds...}; + + for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) { + + int indices[N]{}; + + PRAGMA_UNROLL + for (int j = N - 1, ii = i; j >= 0; --j) { + indices[j] = ii % dims[j]; + ii /= dims[j]; + } + + auto data = read_u4(src + i / 8, i % 8); + + int index = 0; + + PRAGMA_UNROLL + for (int j = N - 1, stride = 1; j >= 0; --j) { + index += indices[order[j]] * stride; + stride *= dims[order[j]]; + } + + atomic_assign_u4(dst + index / 8, index % 8, data); + } +} + +} // namespace + +// col-major interleaved +void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaStream_t st) +{ + Array shape{cols, rows / 8, 2, 4}; + permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); +} + +void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) +{ + if (s % 8 || c % 8) { + std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; + return; + } + Array shape{s, c}; + permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); +} + +// load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store +// load -> extend_to_u16 -> convert -> run + +} // namespace turbomind diff --git a/src/turbomind/kernels/gemm/utils.h b/src/turbomind/kernels/gemm/utils.h new file mode 100644 index 0000000000..965ea8d224 --- /dev/null +++ b/src/turbomind/kernels/gemm/utils.h @@ -0,0 +1,128 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
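A host-side reference (illustrative, operating on one value per element rather than packed nibbles) of the index permutation `permute_u4` performs: decompose the linear source index over `dims`, then recombine the digits in the order given by `order`. `unpack_awq_gemm` instantiates it with dims `{cols, rows/8, 2, 4}` and order `{0, 1, 3, 2}`; `transpose_u4` uses `{s, c}` with `{1, 0}`.

#include <cstdint>
#include <vector>

std::vector<uint8_t> permute_u4_reference(const std::vector<uint8_t>& src,  // one value (0..15) per element
                                          const std::vector<int>&     dims,
                                          const std::vector<int>&     order)
{
    const int n = (int)dims.size();
    size_t count = 1;
    for (int d : dims) count *= d;

    std::vector<uint8_t> dst(count);
    for (size_t i = 0; i < count; ++i) {
        // Decompose i into per-dimension indices (row-major, last dim fastest).
        std::vector<int> idx(n);
        size_t ii = i;
        for (int j = n - 1; j >= 0; --j) {
            idx[j] = int(ii % dims[j]);
            ii /= dims[j];
        }
        // Recombine in permuted order to get the destination index.
        size_t out = 0, stride = 1;
        for (int j = n - 1; j >= 0; --j) {
            out += size_t(idx[order[j]]) * stride;
            stride *= dims[order[j]];
        }
        dst[out] = src[i];
    }
    return dst;
}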
+ +#pragma once + +#include "src/turbomind/kernels/gemm/simt.h" +#include "src/turbomind/kernels/gemm/types.h" + +namespace turbomind::gemm { + +__host__ __device__ constexpr Order transpose(Order order) +{ + return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; +} + +__host__ __device__ constexpr MatrixLayout transpose(MatrixLayout x) +{ + auto tmp = x.cols; // `std::swap` is not constexpr + x.cols = x.rows; + x.rows = tmp; + x.order = transpose(x.order); + return x; +} + +template +__host__ __device__ constexpr int2 mk2cs(int m, int k) +{ + if constexpr (order == Order::kRowMajor) { + return {k, m}; + } + else { + return {m, k}; + } +} + +template +__host__ __device__ constexpr int2 mk2cs(int2 mk) +{ + return mk2cs(mk.x, mk.y); +} + +template +__host__ __device__ constexpr int2 cs2mk(int c, int s) +{ + if constexpr (order == Order::kRowMajor) { + return {s, c}; + } + else { + return {c, s}; + } +} + +template +__host__ __device__ constexpr int2 _kn2cs(int k, int n) +{ + if constexpr (order == Order::kColMajor) { + return {k, n}; + } + else { + return {n, k}; + } +} + +template +__host__ __device__ constexpr Index cs2idx(int2 cs, Index ld) +{ + return ld * cs.y + cs.x; +} + +template +struct PackingImpl { + __host__ __device__ static constexpr int2 apply(int2 mk) + { + return mk; + } +}; + +template +struct Packing_v2: PackingImpl { +}; + +/// TODO: move packing utility to arch/smem_copy_xxx + +template +struct PackingImpl { + __host__ __device__ static constexpr int2 apply(int2 mk) + { + return {mk.x / 16 / num, mk.y * 16 * num}; + } +}; + +template +struct PackingImpl { + __host__ __device__ static constexpr int2 apply(int2 mk) + { + return {mk.x * 16, mk.y / 16}; + } +}; + +template +struct PackingImpl: PackingImpl { +}; + +template +struct PackingImpl { + __host__ __device__ static constexpr int2 apply(int2 mk) + { + return {mk.x / (simt::OP_M * num), mk.y * simt::OP_M * num}; + } +}; + +template +struct PackingImpl { + __host__ __device__ static constexpr int2 apply(int2 mk) + { + return {mk.x / (simt::OP_N * num), mk.y * simt::OP_N * num}; + } +}; + +template +struct PackingImpl { + __host__ __device__ static constexpr int2 apply(int2 mk) + { + // return {mk.x / (16 * num), mk.y * 16 * num}; + return {mk.x / (32 * num), mk.y * 32 * num}; + } +}; + +} // namespace turbomind::gemm diff --git a/src/turbomind/kernels/gemm_s_f16/CMakeLists.txt b/src/turbomind/kernels/gemm_s_f16/CMakeLists.txt deleted file mode 100644 index 3a8199c4c1..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -add_library(gemm_s4_f16 STATIC gemm_s4_f16.cu format.cu) -target_compile_options(gemm_s4_f16 PRIVATE - --generate-line-info -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr) -set_property(TARGET gemm_s4_f16 PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET gemm_s4_f16 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/kernels/gemm_s_f16/common.h b/src/turbomind/kernels/gemm_s_f16/common.h deleted file mode 100644 index a5caed691c..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/common.h +++ /dev/null @@ -1,668 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
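A quick check (assumed 3x4 example) of the `mk2cs`/`cs2idx` convention defined in `utils.h` above: for row-major data the contiguous axis `c` is `k` and the strided axis `s` is `m`, so `ld * s + c` reproduces the familiar `m * ld + k` offset; for col-major the roles swap.

#include <cassert>

enum class Order { kColMajor, kRowMajor };

void check(Order order, int m, int k, int ld)
{
    int c = (order == Order::kRowMajor) ? k : m;  // mk2cs
    int s = (order == Order::kRowMajor) ? m : k;
    int idx = ld * s + c;                         // cs2idx
    assert(idx == (order == Order::kRowMajor ? m * ld + k : k * ld + m));
}

int main()
{
    check(Order::kRowMajor, 2, 3, 4);  // element (m=2, k=3) of a 3x4 row-major matrix -> offset 11
    check(Order::kColMajor, 2, 3, 3);  // same element, col-major with ld = 3 rows -> offset 11
    return 0;
}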
- -#pragma once - -#include "../attention/data_type.h" -#include "src/turbomind/macro.h" -#include -#include -#include -#include - -#if ENABLE_BF16 -#include -#endif - -namespace turbomind { - -#ifndef TURBOMIND_S4_DEQUANT_USE_FMA -#define TURBOMIND_S4_DEQUANT_USE_FMA 0 -#endif - -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) -#define TURBOMIND_ARCH_SM70 1 -#else -#define TURBOMIND_ARCH_SM70 0 -#endif - -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) -#define TURBOMIND_ARCH_SM75 1 -#else -#define TURBOMIND_ARCH_SM75 0 -#endif - -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) -#define TURBOMIND_ARCH_SM80 1 -#else -#define TURBOMIND_ARCH_SM80 0 -#endif - -constexpr int WARP_SIZE = 32; - -#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__) -#if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) -#define PRAGMA_UNROLL _Pragma("unroll") -#define PRAGMA_NO_UNROLL _Pragma("unroll 1") -#else -#define PRAGMA_UNROLL #pragma unroll -#define PRAGMA_NO_UNROLL #pragma unroll 1 -#endif -#else -#define PRAGMA_UNROLL -#define PRAGMA_NO_UNROLL -#endif - -// Modified from NVIDIA FasterTransformer: -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -// Modified from llm-awq https://github.com/mit-han-lab/llm-awq/blob/main/awq/kernels/csrc/quantization/dequantize.cuh -__inline__ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) -{ - uint4 result; - - uint32_t* h = reinterpret_cast(&result); - uint32_t const i4s = reinterpret_cast(source); - - // First, we extract the i4s and construct an intermediate fp16 number. - static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; - static constexpr uint32_t BOTTOM_MASK = 0x000f000f; - static constexpr uint32_t TOP_MASK = 0x00f000f0; - static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; - - // Note that the entire sequence only requires 1 shift instruction. This is - // thanks to the register packing format and the fact that we force our - // integers to be unsigned, and account for this in the fp16 subtractions. In - // addition, I exploit the fact that sub and fma have the same throughput in - // order to convert elt_23 and elt_67 to fp16 without having to shift them to - // the bottom bits before hand. - - // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW - // dependency if we issue immediately before required. - const uint32_t top_i4s = i4s >> 8; - // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 - asm("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[0]) - : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 - asm("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[1]) - : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 - asm("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[2]) - : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 - asm("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[3]) - : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - - // I use inline PTX below because I am not sure if the compiler will emit - // float2half instructions if I use the half2 ctor. In this case, I chose - // performance reliability over code readability. - - // This is the half2 {1032, 1032} represented as an integer. 
- // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; - // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] - static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; - // This is the half2 {1 / 16, 1 / 16} represented as an integer. - static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; - // This is the half2 {-72, -72} represented as an integer. - // static constexpr uint32_t NEG_72 = 0xd480d480; - // Haotian: Let's use {-64, -64}. - static constexpr uint32_t NEG_64 = 0xd400d400; - - // Finally, we construct the output numbers. - // Convert elt_01 - asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); - // Convert elt_23 - asm("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); - // Convert elt_45 - asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); - // Convert elt_67 - asm("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); - - return result; -} - -__inline__ __device__ uint4 dequantize_s4_to_fp16x2_v2(uint32_t const& source) -{ - uint4 result; - - uint32_t* h = reinterpret_cast(&result); - uint32_t const& i4s = reinterpret_cast(source); - - // First, we extract the i4s and construct an intermediate fp16 number. - static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; - static constexpr uint32_t BOT_MASK = 0x000f000f; - static constexpr uint32_t TOP_MASK = 0x00f000f0; - static constexpr uint32_t MAGIC_NUM_0 = 0x64006400; // `1024` - static constexpr uint32_t MAGIC_NUM_1 = 0x54005400; // `64` - static constexpr uint32_t MAGIC_NUM_2 = MAGIC_NUM_1 >> 4; // `64` >> 4 - - // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW - // dependency if we issue immediately before required. 
- const uint32_t top_i4s = i4s >> 8; - - if (0) { // 1024 & 64 - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_0), "n"(immLut)); - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut)); - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_0), "n"(immLut)); - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[3]) : "r"(top_i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut)); - asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(MAGIC_NUM_0)); - asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[1]) : "r"(h[1]), "r"(MAGIC_NUM_1)); - asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(MAGIC_NUM_0)); - asm("sub.f16x2 %0, %1, %2;\n" : "=r"(h[3]) : "r"(h[3]), "r"(MAGIC_NUM_1)); - } - else { // 64 only, trade 4 hfma2 with 2 shifts - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[0]) : "r"(i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_2), "n"(immLut)); - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[1]) : "r"(i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut)); - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[2]) : "r"(top_i4s), "n"(BOT_MASK), "n"(MAGIC_NUM_2), "n"(immLut)); - asm("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(h[3]) : "r"(top_i4s), "n"(TOP_MASK), "n"(MAGIC_NUM_1), "n"(immLut)); - h[0] <<= 4; - h[2] <<= 4; - // we don't need to subtract the magic nums because zeros will go through the same dequant function - // and carry the same magic constant, the magic num will be canceled out after subtracting zeros - } - - return result; -} - -__inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const* const ptr) -{ - return (uint32_t)__cvta_generic_to_shared(ptr); -} - -__inline__ __device__ void ldmatrix_m8n8_x4_b16(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr) -{ -#if TURBOMIND_ARCH_SM75 - asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" - : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) - : "r"(smem_int_ptr)); -#else - assert(TURBOMIND_ARCH_SM75); -#endif -} - -__inline__ __device__ void ldsm_x4_trans(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr) -{ -#if TURBOMIND_ARCH_SM75 - asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n" - : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) - : "r"(smem_int_ptr)); -#else - assert(TURBOMIND_ARCH_SM75); -#endif -} - -__inline__ __device__ void ldmatrix_m8n8_x2_b16(uint& d0, uint& d1, uint32_t smem_int_ptr) -{ -#if TURBOMIND_ARCH_SM75 - asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(d0), "=r"(d1) : "r"(smem_int_ptr)); -#else - assert(TURBOMIND_ARCH_SM75); -#endif -} - -__inline__ __device__ int sem_fetch(int* lock, bool pred) -{ - int state{}; - if (pred) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); -#else - asm volatile("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); -#endif - } - return state; -} - -__inline__ __device__ void sem_wait(int* lock, int status, bool pred) -{ - int state = 0; - while (__syncthreads_and(state != status)) { - state = sem_fetch(lock, pred); - } - - __syncthreads(); // memory fence -} - -__inline__ __device__ void sem_wait_many(int* lock, int count, bool pred) -{ - int state = 0; - while (__syncthreads_count(state) != count) { - state = sem_fetch(lock, pred); - } - - __syncthreads(); // memory fence -} - -__inline__ __device__ void sem_post(int* lock, int status, bool pred) -{ - 
__syncthreads(); // memory fence - - if (pred) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - asm volatile("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); -#else - asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); -#endif - } -} - -__inline__ __device__ half2 apply_Q(const half2& x, const half2& q) -{ - uint s, z; - (half2&)z = __halves2half2(q.x, q.x); - (half2&)s = __halves2half2(q.y, q.y); - - auto& t = (const uint&)x; - uint u, v; - if (TURBOMIND_S4_DEQUANT_USE_FMA) { - asm("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(v) : "r"(t), "r"(s), "r"(z)); - } - else { - asm("sub.ftz.f16x2 %0, %1, %2;\n" : "=r"(u) : "r"(t), "r"(z)); - asm("mul.ftz.f16x2 %0, %1, %2;\n" : "=r"(v) : "r"(u), "r"(s)); - } - - return (half2&)v; -} - -template -struct Array { - - using value_type = T; - using size_type = int; - using difference_type = int; - using reference = value_type&; - using const_reference = const value_type&; - using pointer = value_type*; - using const_pointer = const value_type*; - using iterator = pointer; - using const_iterator = const_pointer; - - static_assert(N > 0); - - T __a[N]; - - __device__ __host__ constexpr reference operator[](size_type i) noexcept - { - return __a[i]; - } - __device__ __host__ constexpr const_reference operator[](size_type i) const noexcept - { - return __a[i]; - } - - __device__ __host__ constexpr reference front() noexcept - { - return *begin(); - } - - __device__ __host__ constexpr const_reference front() const noexcept - { - return *begin(); - } - - __device__ __host__ constexpr reference back() noexcept - { - return *(end() - 1); - } - - __device__ __host__ constexpr const_reference back() const noexcept - { - return *(end() - 1); - } - - __device__ __host__ constexpr pointer data() noexcept - { - return &__a[0]; - } - - __device__ __host__ constexpr const_pointer data() const noexcept - { - return &__a[0]; - } - - __device__ __host__ constexpr iterator begin() noexcept - { - return data(); - } - - __device__ __host__ constexpr const_iterator begin() const noexcept - { - return data(); - } - - __device__ __host__ constexpr iterator end() noexcept - { - return data() + N; - } - - __device__ __host__ constexpr const_iterator end() const noexcept - { - return data() + N; - } - - __device__ __host__ constexpr std::integral_constant size() const noexcept - { - return {}; - } - - __device__ __host__ constexpr std::false_type empty() const noexcept - { - return {}; - } -}; - -template -struct Array { - using value_type = detail::__uint4_t; - using size_type = int; - using difference_type = int; - using reference = value_type&; - using const_reference = const value_type&; - using pointer = SubBytePtr; - using const_pointer = SubBytePtr; - - static_assert(N % 8 == 0); - - detail::__uint4_t __a[N / 8]; - - __device__ __host__ constexpr reference operator[](size_type i) noexcept - { - return __a[i / 8]; - } - __device__ __host__ constexpr const_reference operator[](size_type i) const noexcept - { - return __a[i / 8]; - } - - __device__ __host__ constexpr std::integral_constant size() const noexcept - { - return {}; - } - - __device__ __host__ constexpr std::false_type empty() const noexcept - { - return {}; - } - - __device__ __host__ constexpr pointer data() noexcept - { - return {(char*)&__a[0]}; - } -}; - -static_assert(sizeof(Array) == 4); -static_assert(sizeof(Array) == 8); -static_assert(sizeof(Array) == 12); -static_assert(sizeof(Array) == 16); - -template -struct Shape { - static constexpr Array data_{Ns...}; - - 
constexpr Shape() = default; - - Shape(std::integral_constant...){}; - - template - constexpr auto get() const noexcept - { - return std::integral_constant{}; - } - - constexpr auto m() const noexcept - { - return get<0>(); - } - - constexpr auto n() const noexcept - { - return get<1>(); - } - - constexpr auto k() const noexcept - { - return get<2>(); - } - - constexpr int c() const noexcept - { - return get<0>(); - } - - constexpr int s() const noexcept - { - return get<1>(); - } - - constexpr int count() const noexcept - { - return (Ns * ...); - } -}; - -__inline__ __device__ void -mma_m16n8k8_row_col(Array& d, const Array& a, const Array& b, Array& c) -{ -#if TURBOMIND_ARCH_SM75 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - float const* C = reinterpret_cast(&c); - float* D = reinterpret_cast(&d); - asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " - "{%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); -#else - assert(TURBOMIND_ARCH_SM75); -#endif -} - -__inline__ __device__ void -mma_m16n8k8_row_col(Array& d, const Array& a, const Array& b, Array& c) -{ -#if TURBOMIND_ARCH_SM75 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - uint32_t const* C = reinterpret_cast(&c); - uint32_t* D = reinterpret_cast(&d); - asm volatile("mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, " - "{%2,%3}, {%4}, {%5,%6};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1])); -#else - assert(TURBOMIND_ARCH_SM75); -#endif -} - -__inline__ __device__ void mma_m16n8k8_row_col(Array& d, - const Array& a, - const Array& b, - Array& c) -{ -#if TURBOMIND_ARCH_SM80 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - float const* C = reinterpret_cast(&c); - float* D = reinterpret_cast(&d); - asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 {%0,%1,%2,%3}, " - "{%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); -#else - assert(TURBOMIND_ARCH_SM80); -#endif -} - -__inline__ __device__ void mma_m16n8k8_row_col(Array& d, - const Array& a, - const Array& b, - Array& c) -{ -#if TURBOMIND_ARCH_SM80 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - uint32_t const* C = reinterpret_cast(&c); - uint32_t* D = reinterpret_cast(&d); - asm volatile("mma.sync.aligned.m16n8k8.row.col.bf16.bf16.bf16.bf16 {%0,%1}, " - "{%2,%3}, {%4}, {%5,%6};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1])); -#else - assert(TURBOMIND_ARCH_SM80); -#endif -} - -__inline__ __device__ void -mma_m16n8k16_row_col(Array& d, const Array& a, const Array& b, Array& c) -{ -#if TURBOMIND_ARCH_SM80 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - float const* C = reinterpret_cast(&c); - float* D = reinterpret_cast(&d); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); -#else - const Array* _a = (const Array*)&a; - const Array* _b = (const Array*)&b; - 
mma_m16n8k8_row_col(d, _a[0], _b[0], c); - mma_m16n8k8_row_col(d, _a[1], _b[1], d); -#endif -} - -__inline__ __device__ void -mma_m16n8k16_row_col(Array& d, const Array& a, const Array& b, Array& c) -{ -#if TURBOMIND_ARCH_SM80 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - uint32_t const* C = reinterpret_cast(&c); - uint32_t* D = reinterpret_cast(&d); - asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, " - "{%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1])); -#else - const Array* _a = (const Array*)&a; - const Array* _b = (const Array*)&b; - mma_m16n8k8_row_col(d, _a[0], _b[0], c); - mma_m16n8k8_row_col(d, _a[1], _b[1], d); -#endif -} - -__inline__ __device__ void mma_m16n8k16_row_col(Array& d, - const Array& a, - const Array& b, - Array& c) -{ -#if TURBOMIND_ARCH_SM80 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - float const* C = reinterpret_cast(&c); - float* D = reinterpret_cast(&d); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 {%0,%1,%2,%3}, " - "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); -#else - const Array* _a = (const Array*)&a; - const Array* _b = (const Array*)&b; - mma_m16n8k8_row_col(d, _a[0], _b[0], c); - mma_m16n8k8_row_col(d, _a[1], _b[1], d); -#endif -} - -__inline__ __device__ void mma_m16n8k16_row_col(Array& d, - const Array& a, - const Array& b, - Array& c) -{ -#if TURBOMIND_ARCH_SM80 - uint32_t const* A = reinterpret_cast(&a); - uint32_t const* B = reinterpret_cast(&b); - uint32_t const* C = reinterpret_cast(&c); - uint32_t* D = reinterpret_cast(&d); - asm volatile("mma.sync.aligned.m16n8k16.row.col.bf16.bf16.bf16.bf16 {%0,%1}, " - "{%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" - : "=r"(D[0]), "=r"(D[1]) - : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1])); -#else - const Array* _a = (const Array*)&a; - const Array* _b = (const Array*)&b; - mma_m16n8k8_row_col(d, _a[0], _b[0], c); - mma_m16n8k8_row_col(d, _a[1], _b[1], d); -#endif -} - -__inline__ __device__ void ldsm_x4_trans(Array& d, uint32_t smem_int_ptr) -{ - ldsm_x4_trans(d[0], d[1], d[2], d[3], smem_int_ptr); -} - -__inline__ __device__ void ldsm_x4(Array& d, uint32_t smem_int_ptr) -{ - ldmatrix_m8n8_x4_b16(d[0], d[1], d[2], d[3], smem_int_ptr); -} - -template -__device__ void CpAsync(T* dst, const Array* __restrict__ src) -{ - const int smem_int_ptr = cast_smem_ptr_to_uint(dst); - constexpr int cp_size = sizeof(Array); -#if TURBOMIND_ARCH_SM80 - asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n" ::"r"(smem_int_ptr), "l"(src), "n"(cp_size)); -#else - assert(TURBOMIND_ARCH_SM80); -#endif -} - -__inline__ __device__ uint transpose_m8n8_b16_warp_shuffle(uint value) -{ - const int lane_id = threadIdx.x % WARP_SIZE; - int src_lane = lane_id / 8 + lane_id % 4 * 8; - uint u0 = __shfl_sync(0xffffffff, value, src_lane); - uint u1 = __shfl_sync(0xffffffff, value, src_lane + 4); - short2 r; - - if (lane_id % 8 < 4) { - r.x = ((short2&)u0).x; - r.y = ((short2&)u1).x; - } - else { - r.x = ((short2&)u0).y; - r.y = ((short2&)u1).y; - } - return (uint&)r; -} - -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8) -__inline__ __device__ uint transpose_m8n8_b16_movmatrix(uint a) -{ 
-#if TURBOMIND_ARCH_SM75 - uint d; - asm volatile("movmatrix.sync.aligned.m8n8.trans.b16 %0, %1;\n" : "=r"(d) : "r"(a)); - return d; -#else - assert(TURBOMIND_ARCH_SM75); - return 0; -#endif -} -#endif - -__inline__ __device__ uint32_t transpose_m8n8_b16(uint32_t a) -{ -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8) - return transpose_m8n8_b16_movmatrix(a); -#else - return transpose_m8n8_b16_warp_shuffle(a); -#endif -} - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/cta_iterator.h b/src/turbomind/kernels/gemm_s_f16/cta_iterator.h deleted file mode 100644 index 2e273472fe..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/cta_iterator.h +++ /dev/null @@ -1,646 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include "common.h" -#include -#include - -namespace turbomind { - -#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 4) -#define L2_CACHEHINT(size) ".L2::" #size "B" -#else -#define L2_CACHEHINT(size) -#endif - -template -__inline__ __device__ void cp_async_cg_A(uint32_t smem_int_ptr, const T* __restrict__ src, bool mask) -{ -#if TURBOMIND_ARCH_SM80 - constexpr int cp_size = sizeof(T); - static_assert(cp_size == 16, "cp.async.cg requreis cp_size == 16"); - // clang-format off - asm volatile("{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global" L2_CACHEHINT(256) " [%1], [%2], %3;\n" - "}\n" ::"r"((int)mask), - "r"(smem_int_ptr), - "l"(src), - "n"(cp_size)); - // clang-format on -#else - assert(TURBOMIND_ARCH_SM80); -#endif -} - -template -__inline__ __device__ void cp_async_cg_B(uint32_t smem_int_ptr, const T* __restrict__ src, bool mask) -{ -#if TURBOMIND_ARCH_SM80 - constexpr int cp_size = sizeof(T); - static_assert(cp_size == 16, "cp.async.cg requreis cp_size == 16"); - // clang-format off - asm volatile("{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;\n" - "}\n" ::"r"((int)mask), - "r"(smem_int_ptr), - "l"(src), - "n"(cp_size)); - // clang-format on -#else - assert(TURBOMIND_ARCH_SM80); -#endif -} - -template -__inline__ __device__ void cp_async_ca(uint32_t smem_int_ptr, const T* __restrict__ src, bool mask) -{ -#if TURBOMIND_ARCH_SM80 - constexpr int cp_size = sizeof(T); - // clang-format off - asm volatile("{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.ca.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;\n" - "}\n" ::"r"((int)mask), - "r"(smem_int_ptr), - "l"(src), - "n"(cp_size)); - // clang-format on -#else - assert(TURBOMIND_ARCH_SM80); -#endif -} - -template -struct IteratorA { - static constexpr int SLICE_K = CTA_K / SLICES; - - using AccessType = uint4; - static constexpr int kAccessSize = sizeof(AccessType); - - static_assert(CTA_M % 32 == 0 && CTA_K % 32 == 0, "A is pre-formatted as 32x32 tiles"); - - // A is [K/32, M/32, WARP_SIZE] uint4 - - static constexpr int kShapeM = CTA_M; - static constexpr int kShapeK = SLICE_K / 32; - - // thread access shape - static constexpr int kAccessM = 1; - static constexpr int kAccessK = 1; - - // warp thread arrangement - static constexpr int kWarpThreadC = 32; - static constexpr int kWarpThreadS = 1; - - // warp shape per access - static constexpr int kWarpAccessM = kWarpThreadC * kAccessM; // 32 - static constexpr int kWarpAccessK = kWarpThreadS * kAccessK; // 1 - - // warp access iterations - static constexpr int kWarpIterM = kShapeM / kWarpAccessM; - static constexpr int kWarpIterK = kShapeK / kWarpAccessK; - - // 
warp arrangement - static constexpr int kWarpM = kWarpIterM >= WARPS ? WARPS : kWarpIterM; - static constexpr int kWarpK = WARPS > kWarpIterM ? (WARPS / kWarpM) : 1; - - // iterations - static constexpr int kIterM = kWarpIterM / kWarpM; - static constexpr int kIterK = kWarpIterK / kWarpK; - - static constexpr int kIterCount = kIterM * kIterK; - static_assert(kIterCount > 0); - - // warp footprint - static constexpr int kWarpFootprintM = kWarpAccessM * kIterM; - static constexpr int kWarpFootprintK = kWarpAccessK * kIterK; - - static constexpr int kSizePerStage = kShapeK * kShapeM; - static constexpr int kSmemByteSize = kAccessSize * STAGES * kSizePerStage; - - const uint* src_; - AccessType* smem_; - uint32_t smem_int_ptr_; - - const int m_; - const int k_; - - const int warp_id_; - const int lane_id_; - - int src_offset_; - int dst_offset_; - - int src_step_m_; - int src_step_k_; - int src_step_s_; - - int dst_step_m_; - int dst_step_k_; - int dst_step_s_; - - int iter_m_{0}; - - IteratorA() = default; - - __device__ IteratorA(const uint* src, void* smem, int m, int k, int cta_m, int cta_k, int warp_id, int lane_id): - src_(src), - smem_((AccessType*)smem), - smem_int_ptr_(cast_smem_ptr_to_uint(smem)), - m_(m), - k_(k), - warp_id_(warp_id), - lane_id_(lane_id) - { - const int warp_offset_m = warp_id_ % kWarpM; - const int warp_offset_k = warp_id_ / kWarpM; - - const int warp_thread_offset_m = lane_id_ % kWarpThreadC; - const int warp_thread_offset_k = lane_id_ / kWarpThreadC; - - const int cta_thread_offset_m = kWarpFootprintM * warp_offset_m + warp_thread_offset_m * kAccessM; - const int cta_thread_offset_k = kWarpFootprintK * warp_offset_k + warp_thread_offset_k * kAccessK; - - const int src_offset_m = cta_thread_offset_m + cta_m; - const int src_offset_k = cta_thread_offset_k + cta_k / 32; - - src_offset_ = src_offset_k * m_ + src_offset_m; - src_step_m_ = kWarpAccessM; - src_step_k_ = kWarpAccessK * m_ - kIterM * kWarpAccessM; - src_step_s_ = CTA_K / 32 * m_ - kIterK * kWarpAccessK * m_; - - const int dst_offset_m = cta_thread_offset_m; - const int dst_offset_k = cta_thread_offset_k; - - dst_offset_ = dst_offset_k * kShapeM + dst_offset_m; - dst_step_m_ = kWarpAccessM; - dst_step_k_ = kWarpAccessK * kShapeM - kIterM * kWarpAccessM; - dst_step_s_ = SLICE_K / 32 * kShapeM - kIterK * kWarpAccessK * kShapeM; - - dst_offset_ *= kAccessSize; - dst_step_m_ *= kAccessSize; - dst_step_k_ *= kAccessSize; - dst_step_s_ *= kAccessSize; - } - - __device__ void prefetch_stage(bool mask) - { - PRAGMA_UNROLL - for (int i = 0; i < kIterCount; ++i) { - prefetch(mask); - ++(*this); - } - next_stage(); - } - - __device__ void prefetch_batch(int batch_idx, int batch_size, bool mask) - { - PRAGMA_UNROLL - for (int i = 0; i < batch_size; ++i) { - if (batch_idx * batch_size + i < kIterCount) { - prefetch(mask); - ++(*this); - } - } - } - - __device__ IteratorA& operator++() - { - src_offset_ += src_step_m_; - dst_offset_ += dst_step_m_; - ++iter_m_; - if (iter_m_ < kIterM) { - return *this; - } - iter_m_ = 0; - src_offset_ += src_step_k_; - dst_offset_ += dst_step_k_; - - return *this; - } - - __device__ void next_stage() - { - src_offset_ += src_step_s_; - dst_offset_ += dst_step_s_; - - if (dst_offset_ >= kSmemByteSize) { - dst_offset_ -= kSmemByteSize; - } - } - - __device__ void prefetch(bool mask) - { -#if TURBOMIND_ARCH_SM80 - cp_async_cg_A(smem_int_ptr_ + dst_offset_, (const AccessType*)src_ + src_offset_, mask); -#else - if (mask) { - *(AccessType*)((uint8_t*)smem_ + dst_offset_) = __ldg((const 
AccessType*)src_ + src_offset_); - } -#endif - } -}; - -template -struct IteratorQ { - static constexpr int SLICE_K = CTA_K / SLICES; - - using AccessType = uint; - static constexpr int kAccessSize = sizeof(AccessType); - - static constexpr int kAccessM = kAccessSize / sizeof(half2); - static constexpr int kAccessK = GROUP_SIZE; - - // warp thread arrangement - static constexpr int kWarpThreadC = 32; - static constexpr int kWarpThreadS = 1; - - // warp shape per access - static constexpr int kWarpAccessM = kWarpThreadC * kAccessM; // 32 - static constexpr int kWarpAccessK = kWarpThreadS * kAccessK; // GROUP_SIZE - - // warp access iterations - static constexpr int kWarpIterM = CTA_M / kWarpAccessM; // CTA_M / 32 - static constexpr int kWarpIterK = SLICE_K / kWarpAccessK; // SLICE_K / GROUP_SIZE, maybe 0 - - // kWarpIterK == 0 => SLICE_K < kWarpAccessK => kIterK == 1 - - // warp arrangement - static constexpr int kWarpM = kWarpIterM >= WARPS ? WARPS : kWarpIterM; - static constexpr int kWarpK = WARPS > kWarpIterM ? WARPS / kWarpM : 1; - - // iterations - static constexpr int kIterM = kWarpIterM / kWarpM; - static constexpr int kIterK = kWarpIterK >= kWarpK ? kWarpIterK / kWarpK : 1; - static constexpr int kIterCount = kIterM * kIterK; - - // warp footprint - static constexpr int kWarpFootprintM = kWarpAccessM * kIterM; - static constexpr int kWarpFootprintK = kWarpAccessK * kIterK; - - static constexpr int kSizePerStage = std::max(SLICE_K / GROUP_SIZE, 1) * CTA_M; - static constexpr int kSmemByteSize = sizeof(uint) * STAGES * kSizePerStage; - - const half2* const src_; - half2* const smem_; - uint32_t const smem_int_ptr_; - - const int m_; - const int k_; - - bool is_out_of_bound_; // mask for out-of-bound warps - - int src_offset_k_; - int src_offset_m_; - - int src_offset_; - int src_step_m_; - int src_step_k_; - - int dst_offset_; - int dst_step_m_; - int dst_step_k_; - - int tmp_src_offset_; - int tmp_dst_offset_; - - int iter_m_{0}; - - struct Storage { - half2 data[SLICES][STAGES * kSizePerStage]; - }; - - IteratorQ() = default; - - __device__ IteratorQ(const half2* src, half2* smem, int m, int k, int cta_m, int cta_k, int warp_id, int lane_id): - src_(src), smem_(smem), smem_int_ptr_(cast_smem_ptr_to_uint(smem)), m_(m), k_(k) - { - const int warp_offset_m = warp_id % kWarpM; - const int warp_offset_k = warp_id / kWarpM; - - const int warp_thread_offset_m = lane_id % kWarpThreadC; - const int warp_thread_offset_k = lane_id / kWarpThreadC; - - const int cta_thread_offset_m = kWarpFootprintM * warp_offset_m + warp_thread_offset_m * kAccessM; - const int cta_thread_offset_k = kWarpFootprintK * warp_offset_k + warp_thread_offset_k * kAccessK; - - // mask out-of-bound warps - is_out_of_bound_ = cta_thread_offset_k >= SLICE_K; - - src_offset_m_ = cta_thread_offset_m + cta_m; - src_offset_k_ = cta_thread_offset_k + cta_k; - - src_offset_ = src_offset_k_ / GROUP_SIZE * m_ + src_offset_m_; - src_step_m_ = kWarpAccessM; - src_step_k_ = m_ - kIterM * kWarpAccessM; // valid only when SLICE_K >= GROUP_SIZE - - const int dst_offset_m = cta_thread_offset_m; - const int dst_offset_k = cta_thread_offset_k; - - dst_offset_ = dst_offset_k / GROUP_SIZE * CTA_M + dst_offset_m; - dst_step_m_ = kWarpAccessM; - dst_step_k_ = CTA_M - kIterM * kWarpAccessM; // valid only when SLICE_K >= GROUP_SIZE - - dst_offset_ *= kAccessSize; - dst_step_m_ *= kAccessSize; - dst_step_k_ *= kAccessSize; - - tmp_src_offset_ = src_offset_; - tmp_dst_offset_ = dst_offset_; - } - - __device__ void prefetch_stage(bool mask) - { - 
if (is_out_of_bound_) { - return; - } - - PRAGMA_UNROLL - for (int i = 0; i < kIterCount; ++i) { - prefetch(mask); - ++(*this); - } - next_stage(); - } - - __device__ void prefetch_batch(int batch_idx, int batch_size, bool mask) - { - if (is_out_of_bound_) { - return; - } - - PRAGMA_UNROLL - for (int i = 0; i < batch_size; ++i) { - if (batch_idx * batch_size + i < kIterCount) { - prefetch(mask); - ++(*this); - } - } - } - - __device__ IteratorQ& operator++() - { - ++iter_m_; - - src_offset_ += src_step_m_; - dst_offset_ += dst_step_m_; - if (iter_m_ < kIterM) { - return *this; - } - - iter_m_ = 0; - - if constexpr (SLICE_K >= GROUP_SIZE) { - src_offset_ += src_step_k_; - dst_offset_ += dst_step_k_; - } - // else advnace offsets in `next_stage` - - return *this; - } - - __device__ void next_stage() - { - if constexpr (SLICE_K >= GROUP_SIZE) { - src_offset_ += (CTA_K / GROUP_SIZE - kIterK) * m_; - dst_offset_ += kAccessSize * (SLICE_K / GROUP_SIZE - kIterK) * CTA_M; - } - else { // SLICE_K < GROUP_SIZE, recompute `src_offset_` - src_offset_k_ += CTA_K; - src_offset_ = (src_offset_k_ / GROUP_SIZE) * m_ + src_offset_m_; - dst_offset_ += dst_step_k_; - } - - if (dst_offset_ >= kSmemByteSize) { - dst_offset_ -= kSmemByteSize; - } - } - - __device__ void prefetch(bool mask) - { -#if TURBOMIND_ARCH_SM80 - cp_async_ca(smem_int_ptr_ + dst_offset_, (const AccessType*)src_ + src_offset_, mask); -#else - if (mask) { - *(AccessType*)((uint8_t*)smem_ + dst_offset_) = __ldg((const AccessType*)src_ + src_offset_); - } -#endif - } -}; - -template -struct IteratorB { - - static constexpr int SLICE_K = CTA_K / SLICES; - static constexpr int kElementSize = sizeof(half); - using AccessType = uint4; - static constexpr int kAccessSize = sizeof(AccessType); - - static constexpr int kShapeK = SLICE_K; - static constexpr int kShapeN = CTA_N; - - static constexpr int kAccessK = kAccessSize / sizeof(half); - - static_assert(kShapeK % kAccessSize == 0); - - // warp thread arrangement - static constexpr int kWarpThreadC = std::max(kShapeK / kAccessK, 1); - static constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC; - - // warp shape per access - static constexpr int kWarpAccessK = kWarpThreadC * kAccessK; - static constexpr int kWarpAccessN = kWarpThreadS; - - // warp access iterations - static constexpr int kWarpIterK = kShapeK / kWarpAccessK; - static constexpr int kWarpIterN = kShapeN / kWarpAccessN; - - // warp arrangement - static constexpr int kWarpK = kWarpIterK >= WARPS ? WARPS : kWarpIterK; - static constexpr int kWarpN = WARPS > kWarpIterK ? WARPS / kWarpK : 1; - - // iterations - static constexpr int kIterK = kWarpIterK / kWarpK; - static constexpr int kIterN = kWarpIterN >= kWarpN ? 
kWarpIterN / kWarpN : 1; - - static constexpr int kIterCount = kIterK * kIterN; - static_assert(kIterCount > 0); - - // warp footprint - static constexpr int kWarpFootprintK = kWarpAccessK * kIterK; - static constexpr int kWarpFootprintN = kWarpAccessN * kIterN; - - // Eliminate bank-conflicts for 8x4 half2 tiles, watch out for misalignment - static constexpr int kSmemPadCtaK = SLICE_K + 8; - static constexpr int kSizePerTile = CTA_N * kSmemPadCtaK; - static constexpr int kSmemByteSize = kElementSize * STAGES * kSizePerTile; - - const half* src_; - AccessType* const smem_; // [CTA_N, SLICE_K + 8] - const uint32_t smem_int_ptr_; - const int k_; - const int n_; - const int cta_n_; - const int warp_id_; - const int lane_id_; - const int c_; - const int s_; - - int src_offset_n_; - - int src_offset_; - int dst_offset_; - - int src_step_k_; - int src_step_n_; - int dst_step_k_; - int dst_step_n_; - bool is_valid_n_; - - int tmp_src_offset_; - int tmp_dst_offset_; - int tmp_src_offset_n_; - - int iter_k_{0}; - int iter_n_{0}; - - // upper bound N - int upper_n_; - - IteratorB() = default; - - __device__ IteratorB(const half* src, void* smem, int k, int n, int cta_n, int cta_k, int warp_id, int lane_id): - src_(src), - smem_((AccessType*)smem), - smem_int_ptr_(cast_smem_ptr_to_uint(smem)), - k_(k), - n_(n), - cta_n_(cta_n), - warp_id_(warp_id), - lane_id_(lane_id), - c_(lane_id_ % kWarpThreadC), - s_(lane_id_ / kWarpThreadC) - { - - const int warp_offset_k = warp_id_ % kWarpK; - const int warp_offset_n = warp_id_ / kWarpK; - - const int warp_thread_offset_k = lane_id_ % kWarpThreadC; - const int warp_thread_offset_n = lane_id_ / kWarpThreadC; - - const int cta_thread_offset_k = kWarpFootprintK * warp_offset_k + warp_thread_offset_k * kAccessK; - const int cta_thread_offset_n = kWarpFootprintN * warp_offset_n + warp_thread_offset_n; - - const int src_offset_k = cta_thread_offset_k + cta_k; - src_offset_n_ = cta_thread_offset_n + cta_n_; - - src_offset_ = src_offset_n_ * k_ + src_offset_k; - - const int dst_offset_k = cta_thread_offset_k; - const int dst_offset_n = cta_thread_offset_n; - - dst_offset_ = dst_offset_n * kSmemPadCtaK + dst_offset_k; - - src_step_k_ = kWarpAccessK; - src_step_n_ = kWarpAccessN * k_ - kIterK * kWarpAccessK; - - dst_step_k_ = kWarpAccessK; - dst_step_n_ = kWarpAccessN * kSmemPadCtaK - kIterK * kWarpAccessK; - - dst_offset_ *= kElementSize; - dst_step_k_ *= kElementSize; - dst_step_n_ *= kElementSize; - - tmp_src_offset_ = src_offset_; - tmp_dst_offset_ = dst_offset_; - tmp_src_offset_n_ = src_offset_n_; - - // avoid (global mem -> shared mem) WAW(write after write) conflict - upper_n_ = std::min(cta_n_ + CTA_N, n_); - is_valid_n_ = tmp_src_offset_n_ < upper_n_; - } - - __device__ void prefetch_stage(bool mask) - { - - PRAGMA_UNROLL - for (int i = 0; i < kIterCount; ++i) { - prefetch(mask); - ++(*this); - } - next_stage(); - } - - __device__ void prefetch_batch(int batch_idx, int batch_size, bool mask) - { - PRAGMA_UNROLL - for (int i = 0; i < batch_size; ++i) { - if (batch_idx * batch_size + i < kIterCount) { - prefetch(mask); - ++(*this); - } - } - } - - __device__ IteratorB& operator++() - { - if (!is_valid_n_) { - return *this; - } - - // move to next k - tmp_src_offset_ += src_step_k_; - tmp_dst_offset_ += dst_step_k_; - ++iter_k_; - if (iter_k_ < kIterK) { - return *this; - } - - // move to next n - iter_k_ = 0; - tmp_src_offset_n_ += kWarpAccessN; - tmp_src_offset_ += src_step_n_; - tmp_dst_offset_ += dst_step_n_; - is_valid_n_ = tmp_src_offset_n_ < upper_n_; - 
++iter_n_; - - return *this; - } - - __device__ void next_stage() - { - iter_n_ = 0; - - src_offset_ += CTA_K; - dst_offset_ += kElementSize * kSizePerTile; - if (dst_offset_ >= kSmemByteSize) { - dst_offset_ -= kSmemByteSize; - } - - tmp_src_offset_ = src_offset_; - tmp_dst_offset_ = dst_offset_; - tmp_src_offset_n_ = src_offset_n_; - - is_valid_n_ = tmp_src_offset_n_ < upper_n_; - } - - __device__ void prefetch(bool mask) - { -#if TURBOMIND_ARCH_SM80 - cp_async_cg_B( - smem_int_ptr_ + tmp_dst_offset_, (const AccessType*)(src_ + tmp_src_offset_), is_valid_n_ && mask); -#else - if (is_valid_n_ && mask) { - *(AccessType*)((uint8_t*)smem_ + tmp_dst_offset_) = __ldg((const AccessType*)(src_ + tmp_src_offset_)); - } -#endif - } -}; - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/format.cu b/src/turbomind/kernels/gemm_s_f16/format.cu deleted file mode 100644 index c64548d8bf..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/format.cu +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include "common.h" -#include - -namespace turbomind { - -__device__ void atomic_assign_u4(uint32_t* address, uint32_t index, uint32_t value) -{ - uint32_t old = *address; - uint32_t assumed; - do { - assumed = old; - uint32_t tmp = (assumed & ~(0xfu << (index * 4u))) | (value << (index * 4u)); - old = atomicCAS(address, assumed, tmp); - } while (assumed != old); -} - -__device__ uint32_t read_u4(const uint32_t* address, uint32_t index) -{ - return (*address >> (index * 4u)) & 0xfu; -} - -template -__global__ void permute_u4(uint* dst, const uint* src, Array dims) -{ - constexpr int N = sizeof...(Ds); - - size_t count = 1; - PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - count *= dims[i]; - } - - constexpr int order[] = {Ds...}; - - for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) { - - int indices[N]{}; - - PRAGMA_UNROLL - for (int j = N - 1, ii = i; j >= 0; --j) { - indices[j] = ii % dims[j]; - ii /= dims[j]; - } - - auto data = read_u4(src + i / 8, i % 8); - - int index = 0; - - PRAGMA_UNROLL - for (int j = N - 1, stride = 1; j >= 0; --j) { - index += indices[order[j]] * stride; - stride *= dims[order[j]]; - } - - atomic_assign_u4(dst + index / 8, index % 8, data); - } -} - -void reformat_s4_k8_m(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st) -{ - // permutation for [k/8, m] layout - Array shape{k / 32, 2, 2, m / 32, 2, 2, 8, 2, 2, 2}; - // |warp| lane | 2x2 | a0-7 | - permute_u4<0, 3, 6, 8, 9, 1, 4, 7, 2, 5><<<512, 512, 0, st>>>(dst, src, shape); -} - -void reformat_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st) -{ - // permutation for [k, m/8] layout - Array shape{k / 32, 2, 2, 4, 2, m / 32, 2, 2, 2, 4}; - // |warp| lane | 2x2 | a0-7 | - permute_u4<0, 5, 9, 8, 3, 1, 6, 4, 2, 7><<<512, 512, 0, st>>>(dst, src, shape); -} - -__global__ void dequantize_s4_offset_64(uint4* dst, const uint32_t* src, size_t count) -{ - for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) { - dst[i] = dequantize_s4_to_fp16x2_v2(src[i]); - } -} - -__global__ void merge_Q(half2* Q, const half* scales, const half* zeros, int count) -{ - for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) { - if (TURBOMIND_S4_DEQUANT_USE_FMA) { - // dequant via HFMA2 has numerical statbility issue - Q[i] = __halves2half2(-zeros[i] * scales[i], scales[i]); - } - else { - Q[i] = __halves2half2(zeros[i], scales[i]); - } - } -} - 
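For reference, the two dequantization forms that `merge_Q` above switches between are algebraically equivalent: with a per-group `(scale, zero)` pair, a 4-bit weight `q` is recovered as `(q - zero) * scale`, or as `q * scale + (-zero * scale)` when the `(-zero*scale, scale)` pair is packed so the dequant collapses into a single HFMA2 (the source comment notes the FMA form is the numerically weaker one in fp16). A minimal host-side sketch, with `float` standing in for `half` and illustrative values:

```cpp
#include <cassert>
#include <cmath>

int main()
{
    const float scale = 0.0125f, zero = 8.0f;  // illustrative group parameters
    for (int q = 0; q < 16; ++q) {
        const float sub_then_scale = (q - zero) * scale;           // (q - z) * s
        const float fma_form       = q * scale + (-zero * scale);  // q * s + (-z * s)
        assert(std::fabs(sub_then_scale - fma_form) < 1e-6f);
    }
    return 0;
}
```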
-void convert_s4_k_m8(uint32_t* A_dst, - half2* Q_dst, - half* workspace, - const uint32_t* A_src, - const half* scales, - const uint32_t* qzeros, - int m, - int k, - int group_size, - cudaStream_t st) -{ - dequantize_s4_offset_64<<<256, 256, 0, st>>>((uint4*)workspace, qzeros, k / group_size * m / 8); - - merge_Q<<<256, 256, 0, st>>>(Q_dst, scales, workspace, k / group_size * m); - - reformat_s4_k_m8(A_dst, A_src, m, k, st); -} - -void transpose_qk_s4_k_m8_hf(uint32_t* dst, const uint32_t* src, int m, int k, int size_per_head, cudaStream_t st) -{ - Array shape{k, m / size_per_head, 2, size_per_head / 2 / 8, 2, 2, 2}; - // dequant transpose quant - // 0123456 -> 0123564 -> 0135642 -> 0135264 - permute_u4<0, 1, 3, 5, 2, 6, 4><<<512, 512, 0, st>>>(dst, src, shape); -} - -// [2, k, m/8] -> [k, m/8, 2] -void fuse_w1_w3_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st) -{ - Array shape{2, k, m / 8, 2, 2, 2}; - // dequant transpose quant - // 012345 -> 012453 -> 124530 -> 124053 - permute_u4<1, 2, 4, 0, 5, 3><<<512, 512, 0, st>>>(dst, src, shape); -} - -__global__ void dequantize_s4_kernel(uint4* dst, const uint* src, size_t count) -{ - for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) { - dst[i] = dequantize_s4_to_fp16x2(src[i]); - } -} - -void dequantize_s4(uint4* dst, const uint32_t* src, size_t count, cudaStream_t st) -{ - dequantize_s4_kernel<<<512, 512>>>(dst, src, count); -} - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/format.h b/src/turbomind/kernels/gemm_s_f16/format.h deleted file mode 100644 index 053d2e8687..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/format.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include -#include -#include - -namespace turbomind { - -void reformat_s4_k8_m(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st = {}); - -void reformat_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st = {}); - -void convert_s4_k_m8(uint32_t* A_dst, - half2* Q_dst, - half* workspace, - const uint32_t* A_src, - const half* scales, - const uint32_t* qzeros, - int m, - int k, - int group_size, - cudaStream_t st = {}); - -void transpose_qk_s4_k_m8_hf(uint32_t* dst, const uint32_t* src, int m, int k, int size_per_head, cudaStream_t st = {}); - -void fuse_w1_w3_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st = {}); - -void dequantize_s4(uint4* dst, const uint32_t* src, size_t count, cudaStream_t st = {}); - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.cu b/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.cu deleted file mode 100644 index 70c2e81ab4..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.cu +++ /dev/null @@ -1,309 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
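The permutation and reformat routines removed above all operate on weights packed eight 4-bit values per `uint32_t`: logical element `i` lives in word `i / 8`, nibble `i % 8` (see `atomic_assign_u4` / `read_u4`). A host-side analogue of that accessor convention, for illustration only (the helper names below are not from the repository):

```cpp
#include <cstddef>
#include <cstdint>

// Read logical 4-bit element i from a buffer of packed uint32_t words.
inline uint32_t read_u4_host(const uint32_t* words, std::size_t i)
{
    return (words[i / 8] >> ((i % 8) * 4u)) & 0xfu;
}

// Write logical 4-bit element i; no CAS loop is needed on the host.
inline void write_u4_host(uint32_t* words, std::size_t i, uint32_t v)
{
    const uint32_t shift = (i % 8) * 4u;
    words[i / 8] = (words[i / 8] & ~(0xfu << shift)) | ((v & 0xfu) << shift);
}
```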
- -#include "gemm_s4_f16.h" -#include "gemm_s4_f16_kernel.h" -#include "metric.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -bool g_dump_kernel_info_once = false; - -namespace ops { - -struct Identity { - static __inline__ __device__ void apply(uint data, int m, int n, half* C, int M, int N) - { - if (n < N) { - (uint&)C[n * M + m] = (uint&)data; - } - } -}; - -struct SiluActivation { - static __inline__ __device__ void apply(uint data, int m, int n, half* C, int M, int N) - { - auto u = __half22float2((half2&)data); - float silu = u.x / (1.f + __expf(-u.x)); - half val = __float2half_rn(silu * u.y); - - if (n < N) { - C[n * (M / 2) + m / 2] = val; - } - } -}; - -struct Add { - static __inline__ __device__ void apply(uint data, int m, int n, half* C, int M, int N) - { - if (n < N) { - C[n * M + m] += ((half2&)data).x; - C[n * M + m + 1] += ((half2&)data).y; - } - } -}; - -} // namespace ops - -template -struct OutputOps { - - template - static __inline__ __device__ void apply(uint data, int m, int n, half* C, int M, int N) - { - std::tuple_element_t>::apply(data, m, n, C, M, N); - } -}; - -struct GemmS4F16::Impl { - - using Kernels = std::vector>; - - template - void Generate(std::vector& kernels) - { - // smem size (KB): - // sm75: 64 - // sm80: 163 - // sm86: 99 - // sm89: 99 - // sm90: 227 - - Kernels k; - - // 256 - k.emplace_back(new GemmKernel, Shape<32, 128, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<64, 64, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<64, 64, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<64, 32, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 16, 128>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 8, 128>, 3, GS, Op>{}); - - // 128 - k.emplace_back(new GemmKernel, Shape<32, 128, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 128, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 96, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 64, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 64, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 32, 64>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 16, 64>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 8, 128>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 8, 128>, 2, GS, Op>{}); // for 86/89 - - // 64 - k.emplace_back(new GemmKernel, Shape<32, 16, 32>, 3, GS, Op>{}); - k.emplace_back(new GemmKernel, Shape<32, 8, 32>, 3, GS, Op>{}); - - kernels.push_back(std::move(k)); - } - - void Measure(half* C, - const uint* A, - const half* B, - const half2* Q, - int m, - int n, - int k, - int group_size, - Type type, - std::vector& metrics, - cudaStream_t st, - std::vector& _kernels) - { - int gid = -1; - for (size_t i = 0; i < group_sizes_.size(); ++i) { - if (group_sizes_[i] == group_size) { - gid = i; - break; - } - } - if (gid < 0) { - throw std::runtime_error("unsupported group size"); - } - const auto& kernels = _kernels[gid]; - metrics = std::vector(kernels.size()); - - int best = 0; - - for (size_t i = 0; i < kernels.size(); ++i) { - metrics[i].id = i; - kernels[i]->GetMetric(metrics[i], m, n, k); - if (!metrics[i].feasible) { - metrics[i].time = std::numeric_limits::infinity(); - metrics[i].count = 1; - continue; - } - if (Compare(metrics[i], metrics[best])) { - best = i; - } - for (size_t j = 0; j < kWarmup + kMeasure; ++j) { - if (j == kWarmup) { - cudaEventRecord(ev_start_, 
st); - } - kernels[i]->Launch(C, A, B, Q, m, n, k, type, st); - } - cudaEventRecord(ev_end_, st); - cudaEventSynchronize(ev_end_); - float ms{}; - cudaEventElapsedTime(&ms, ev_start_, ev_end_); - metrics[i].time = ms; - metrics[i].count = kMeasure; - } - - metrics[best].best = 1; - - // sort metrics - std::vector indices(kernels.size()); - std::iota(indices.begin(), indices.end(), 0); - std::stable_sort( - indices.begin(), indices.end(), [&](int i, int j) { return metrics[i].time < metrics[j].time; }); - - if (g_dump_kernel_info_once) { - DumpMetrics(std::cerr, metrics, indices); - g_dump_kernel_info_once = 0; - } - - std::vector tmp; - for (size_t i = 0; i < indices.size(); ++i) { - tmp.push_back(metrics[indices[i]]); - } - metrics.swap(tmp); - } - - static bool Compare(const Metric& a, const Metric& b) - { - if (a.feasible != b.feasible) { - return a.feasible > b.feasible; - } - - if (a.prefer != b.prefer) { - return a.prefer > b.prefer; - } - - return a.grid_norm < b.grid_norm; - } - - int Estimate(int m, int n, int k, Kernels& kernels) - { - int best = 0; - std::vector metrics(kernels.size()); - for (size_t i = 0; i < kernels.size(); ++i) { - metrics[i].id = i; - kernels[i]->GetMetric(metrics[i], m, n, k); - if (Compare(metrics[i], metrics[best])) { - best = i; - } - } - - if (g_dump_kernel_info_once) { - std::vector indices(kernels.size()); - std::iota(indices.begin(), indices.end(), 0); - std::stable_sort( - indices.begin(), indices.end(), [&](int i, int j) { return Compare(metrics[i], metrics[j]); }); - DumpMetrics(std::cerr, metrics, indices); - g_dump_kernel_info_once = 0; - } - - return best; - } - - void Run(half* C, - const uint* A, - const half* B, - const half2* Q, - int m, - int n, - int k, - int group_size, - Type type, - int algo_id, - cudaStream_t st, - std::vector& kernels) - { - for (size_t i = 0; i < group_sizes_.size(); ++i) { - if (group_sizes_[i] == group_size) { - if (algo_id < 0) { - algo_id = Estimate(m, n, k, kernels[i]); - } - if (algo_id < 0) { - throw std::runtime_error("no feasible kernel found"); - } - kernels[i].at(algo_id)->Launch(C, A, B, Q, m, n, k, type, st); - return; - } - } - throw std::runtime_error("unsupported group size"); - } - - Impl() - { - cudaEventCreate(&ev_start_); - cudaEventCreate(&ev_end_); - - using Ops = OutputOps; - - /// TODO: add more group sizes - Generate<128, Ops>(kernels_); - group_sizes_.push_back(128); - } - - ~Impl() - { - cudaEventDestroy(ev_end_); - cudaEventDestroy(ev_start_); - } - - std::vector kernels_; - - std::vector group_sizes_; - - static constexpr int kWarmup = 10; - static constexpr int kMeasure = 100; - - cudaEvent_t ev_start_{}; - cudaEvent_t ev_end_{}; -}; - -GemmS4F16::GemmS4F16(): impl_(std::make_unique()) {} - -GemmS4F16::~GemmS4F16() = default; - -void GemmS4F16::Measure(half* C, - const uint* A, - const half* B, - const half2* Q, - int m, - int n, - int k, - int group_size, - Type type, - std::vector& metrics, - cudaStream_t st) -{ - impl_->Measure(C, A, B, Q, m, n, k, group_size, type, metrics, st, impl_->kernels_); -} - -void GemmS4F16::Run(half* C, - const uint* A, - const half* B, - const half2* Q, - int m, - int n, - int k, - int group_size, - Type type, - int algo_id, - cudaStream_t st) -{ - impl_->Run(C, A, B, Q, m, n, k, group_size, type, algo_id, st, impl_->kernels_); -} - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h b/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h deleted file mode 100644 index 018c5be8d4..0000000000 --- 
a/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include "metric.h" -#include "src/turbomind/macro.h" -#include -#include - -#include -#include - -namespace turbomind { - -extern bool g_dump_kernel_info_once; - -class GemmS4F16 { -public: - GemmS4F16(); - - ~GemmS4F16(); - - enum Type - { - kGemm, - kFusedSiluFfn, - kFusedAdd - }; - - void Measure(half* C, - const uint* A, - const half* B, - const half2* Q, - int m, - int n, - int k, - int group_size, - Type type, - std::vector& metrics, - cudaStream_t st); - - void Run(half* C, - const uint* A, - const half* B, - const half2* Q, - int m, - int n, - int k, - int group_size, - Type type, - int algo_id, - cudaStream_t st); - -private: - struct Impl; - std::unique_ptr impl_; -}; - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16_kernel.h b/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16_kernel.h deleted file mode 100644 index 0f6fc61c8c..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/gemm_s4_f16_kernel.h +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include "gemm_template.h" - -#include "metric.h" -#include -#include -#include - -namespace turbomind { - -struct IGemmKernel { - - virtual ~IGemmKernel() = default; - - virtual void GetMetric(Metric& metric, int m, int n, int k) = 0; - - virtual void Launch(half* C, - const uint* A, - const half* B, - const half2* Q, - int M, - int N, - int K, - int output_op_idx, - cudaStream_t) = 0; - - virtual void Dump(std::ostream& os) = 0; -}; - -template -struct GemmKernel: public IGemmKernel { - - static constexpr CtaShape cta_shape{}; - static constexpr WarpShape warp_shape{}; - - using GemmType = Gemm; - - decltype(&gemm_s4_f16_nn) kernel_func_; - std::shared_ptr props_; - int max_active_ctas_{}; - - static constexpr int kSlices = GemmType::SLICES; - static constexpr int kSmemSizeA = GemmType::IteratorA::kSmemByteSize * kSlices; - static constexpr int kSmemSizeB = GemmType::IteratorB::kSmemByteSize * kSlices; - static constexpr int kSmemSizeC = sizeof(float) * cta_shape.m() * cta_shape.n(); - static constexpr int kSmemByteSize = std::max(kSmemSizeA + kSmemSizeB, kSmemSizeC); - - // static shared memory size of Q - static constexpr int kSmemSizeQ = sizeof(typename GemmType::IteratorQ::Storage); - - explicit GemmKernel(std::shared_ptr props = {}): props_(std::move(props)) - { - if (!props_) { - props_ = std::make_shared(); - int device_id = -1; - cudaGetDevice(&device_id); - cudaGetDeviceProperties(props_.get(), device_id); - } - - kernel_func_ = gemm_s4_f16_nn; - cudaFuncSetAttribute(kernel_func_, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemByteSize); - - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_ctas_, kernel_func_, GemmType::kWarpCount * WARP_SIZE, kSmemByteSize); - }; - - bool is_feasible(int m, int n, int k) - { - return m % cta_shape.m() == 0 && k % cta_shape.k() == 0; - } - - void GetMetric(Metric& metric, int m, int n, int k) override - { - metric.cta_shape = {cta_shape.m(), cta_shape.n(), cta_shape.k()}; - metric.warp_shape = {warp_shape.m(), warp_shape.n(), warp_shape.k()}; - metric.warps = GemmType::kWarpCount; - metric.stages = Stages; - metric.smem = (kSmemByteSize + kSmemSizeQ) / 1024.f; - - metric.feasible = is_feasible(m, n, k) && max_active_ctas_ > 0; - - metric.prefer = cta_shape.m() != 64 || m <= k; - - if (!metric.feasible) { - return; - } - - int grid_size = ((m + 
cta_shape.m() - 1) / cta_shape.m()) * ((n + cta_shape.n() - 1) / cta_shape.n()); - metric.grid_size = grid_size; - - metric.max_active_ctas = max_active_ctas_; - - metric.active_ctas = - std::min(max_active_ctas_, (grid_size + props_->multiProcessorCount - 1) / props_->multiProcessorCount); - - metric.waves = (float)grid_size / (props_->multiProcessorCount * metric.active_ctas); - metric.occupancy = (metric.active_ctas * GemmType::kWarpCount) - / (float)(props_->maxThreadsPerMultiProcessor / props_->warpSize); - - metric.cta_cnt_m = (m + cta_shape.m() - 1) / cta_shape.m(); - metric.cta_cnt_n = (n + cta_shape.n() - 1) / cta_shape.n(); - metric.cta_iter_k = (k + cta_shape.k() - 1) / cta_shape.k(); - - metric.tile_efficiency = (float)n / (metric.cta_cnt_n * cta_shape.n()); - metric.wave_efficiency = metric.waves / std::ceil(metric.waves); - - const int m_pad = (m + cta_shape.m() - 1) / cta_shape.m() * cta_shape.m(); - const int n_pad = (n + cta_shape.n() - 1) / cta_shape.n() * cta_shape.n(); - - metric.grid_a0 = 0.25f * m * n_pad / cta_shape.n(); // Ta0 * M * [N / ctaN] - metric.grid_b0 = 1.00f * n * m_pad / cta_shape.m(); // Tb0 * N * [M / ctaM] - metric.grid_a1 = 0.65f * m_pad * n_pad / warp_shape.n(); // Ta1 * [M] * [N] / warpN - metric.grid_b1 = 0.25f * m_pad * n_pad / warp_shape.m(); // Tb1 * [M] * [N] / warpM - metric.grid_mm = 1.00f * m_pad * n_pad / 64; // Tm * [M] * [N] - metric.grid_sum = metric.grid_a0 + metric.grid_b0 + metric.grid_a1 + metric.grid_b1 + metric.grid_mm; - - metric.cta_sum = metric.grid_sum / grid_size; - - metric.waves1 = (float)grid_size / (props_->multiProcessorCount * metric.active_ctas); - - metric.cta_wave = std::ceil(metric.waves1) * metric.active_ctas; - metric.grid_norm = metric.cta_wave * metric.cta_sum; - } - - void Launch( - half* C, const uint* A, const half* B, const half2* Q, int M, int N, int K, int output_op_idx, cudaStream_t st) - override - { - constexpr int block_size = GemmType::kWarpCount * WARP_SIZE; - - dim3 grid_size((M + cta_shape.m() - 1) / cta_shape.m(), (N + cta_shape.n() - 1) / cta_shape.n()); - - kernel_func_<<>>(C, A, B, Q, M, N, K, output_op_idx); - } - - void Dump(std::ostream& os) override - { - { - os << "[Gemm] CTA shape: " << cta_shape.m() << "x" << cta_shape.n() << "x" << cta_shape.k() << std::endl; - os << "[Gemm] warp shape: " << warp_shape.m() << "x" << warp_shape.n() << "x" << warp_shape.k() - << std::endl; - os << "[Gemm] warp count: " << GemmType::kWarpCountM << "x" << GemmType::kWarpCountN << "x" - << GemmType::kWarpCountK << " (" << GemmType::kWarpCount << ")" << std::endl; - os << std::endl; - } - - { - using Iter = typename GemmType::IteratorA; - os << "[A] shape: " << Iter::kShapeM << " " << Iter::kShapeK << std::endl; - os << "[A] warp thread arrangement: " << Iter::kWarpThreadC << " " << Iter::kWarpThreadS << std::endl; - os << "[A] warp shape per access: " << Iter::kWarpAccessM << " " << Iter::kWarpAccessK << std::endl; - os << "[A] warp access iters: " << Iter::kWarpIterM << " " << Iter::kWarpIterK << std::endl; - os << "[A] warp arrangement: " << Iter::kWarpM << " " << Iter::kWarpK << std::endl; - os << "[A] iterations: " << Iter::kIterM << " " << Iter::kIterK << std::endl; - os << "[A] iters per tile: " << Iter::kIterCount << std::endl; - os << "[A] warp footprint: " << Iter::kWarpFootprintM << " " << Iter::kWarpFootprintK << std::endl; - os << "[A] shared memory: " << Iter::kSmemByteSize << std::endl; - os << std::endl; - } - { - using Iter = typename GemmType::IteratorB; - os << "[B] shape: " << Iter::kShapeK 
<< " " << Iter::kShapeN << std::endl; - os << "[B] warp thread arrangement: " << Iter::kWarpThreadC << " " << Iter::kWarpThreadS << std::endl; - os << "[B] warp shape per access: " << Iter::kWarpAccessK << " " << Iter::kWarpAccessN << std::endl; - os << "[B] warp access iters: " << Iter::kWarpIterK << " " << Iter::kWarpIterN << std::endl; - os << "[B] warp arrangement: " << Iter::kWarpK << " " << Iter::kWarpN << std::endl; - os << "[B] iterations: " << Iter::kIterK << " " << Iter::kIterN << std::endl; - os << "[B] iters per tile: " << Iter::kIterCount << std::endl; - os << "[B] warp footprint: " << Iter::kWarpFootprintK << " " << Iter::kWarpFootprintN << std::endl; - os << "[B] shared memory: " << Iter::kSmemByteSize << std::endl; - os << std::endl; - } - { - - using Iter = typename GemmType::IteratorQ; - // os << "[Q] shape: " << CTA_M << " " << Iter::SLICE_K << std::endl; - os << "[Q] warp thread arrangement: " << Iter::kWarpThreadC << " " << Iter::kWarpThreadS << std::endl; - os << "[Q] warp shape per access: " << Iter::kWarpAccessM << " " << Iter::kWarpAccessK << std::endl; - os << "[Q] warp access iters: " << Iter::kWarpIterM << " " << Iter::kWarpIterK << std::endl; - os << "[Q] warp arrangement: " << Iter::kWarpM << " " << Iter::kWarpK << std::endl; - os << "[Q] iterations: " << Iter::kIterM << " " << Iter::kIterK << std::endl; - os << "[Q] iters per tile: " << Iter::kIterCount << std::endl; - os << "[Q] warp footprint: " << Iter::kWarpFootprintM << " " << Iter::kWarpFootprintK << std::endl; - os << "[Q] size per stage: " << Iter::kSizePerStage << std::endl; - os << "[Q] shared memory: " << Iter::kSmemByteSize << std::endl; - os << std::endl; - } - os << "Dynamic shared memory size: " << kSmemByteSize << std::endl; - } -}; - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/gemm_template.h b/src/turbomind/kernels/gemm_s_f16/gemm_template.h deleted file mode 100644 index f30422ac94..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/gemm_template.h +++ /dev/null @@ -1,391 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
- -#pragma once - -#include "common.h" -#include "cta_iterator.h" -#include "warp_iterator.h" -#include - -namespace turbomind { - -namespace ops { - -__inline__ __device__ float4 operator+(const float4& a, const float4& b) -{ - return {a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w}; -} - -__inline__ __device__ float2 operator+(const float2& a, const float2& b) -{ - return {a.x + b.x, a.y + b.y}; -} - -} // namespace ops - -template -struct Gemm { - - static constexpr int kWarpCountM = CTA_M / WARP_M; - static constexpr int kWarpCountN = CTA_N / WARP_N; - static constexpr int kWarpCountK = CTA_K / WARP_K; - - static constexpr int kWarpCountMN = kWarpCountM * kWarpCountN; - static constexpr int kWarpCount = kWarpCountMN * kWarpCountK; - - static constexpr int SLICES = kWarpCountK; - static constexpr int SLICE_K = CTA_K / SLICES; - - static_assert(SLICE_K % WARP_K == 0, "infeasible sliced-k setting"); - - using IteratorA = turbomind::IteratorA; - using IteratorQ = turbomind::IteratorQ; - using IteratorB = turbomind::IteratorB; - - static constexpr int OP_M = 16; - static constexpr int OP_N = 8; - static constexpr int OP_K = 16; - - using WarpIterA = turbomind::WarpIteratorA; - - using WarpIterB = - turbomind::WarpIteratorB; - - __device__ void warp_mma(IteratorA& iter_A, - IteratorQ& iter_Q, - IteratorB& iter_B, - WarpIterA& warp_iter_A, - WarpIterB& warp_iter_B, - float* accum, - int slice_id, - int& gemm_iter) - { - - constexpr int ITER_M = WARP_M / OP_M; - constexpr int ITER_N = WARP_N / OP_N; - constexpr int ITER_K = WARP_K / OP_K; - - constexpr int kBatchA = (IteratorA::kIterCount + ITER_K - 1) / ITER_K; - constexpr int kBatchQ = (IteratorQ::kIterCount + ITER_K - 1) / ITER_K; - constexpr int kBatchB = (IteratorB::kIterCount + ITER_K - 1) / ITER_K; - - auto frag_C_ptr = (Array*)accum; // [ITER_N, ITER_M] - - PRAGMA_UNROLL - for (int iter_k = 0; iter_k < ITER_K; ++iter_k) { - - warp_iter_A.load(warp_frag_A_[(iter_k + 1) % 2], (iter_k + 1) % ITER_K); - warp_iter_B.load(warp_frag_B_[(iter_k + 1) % 2], (iter_k + 1) % ITER_K); - - auto warp_frag_A = warp_frag_A_[iter_k % 2]; - auto warp_frag_B = warp_frag_B_[iter_k % 2]; - - PRAGMA_UNROLL - for (int iter_m = 0; iter_m < ITER_M; ++iter_m) { - PRAGMA_UNROLL - for (int iter_n = 0; iter_n < ITER_N; ++iter_n) { - auto& frag_A = warp_frag_A[iter_m]; - auto& frag_B = warp_frag_B[iter_n]; - auto& frag_C = frag_C_ptr[iter_n * ITER_M + iter_m]; - mma_m16n8k16_row_col(frag_C, frag_A, frag_B, frag_C); - } - } - - if (iter_k < ITER_K - 1) { - iter_A.prefetch_batch(iter_k, kBatchA, gemm_iter > 0); - iter_Q.prefetch_batch(iter_k, kBatchQ, gemm_iter > 0); - iter_B.prefetch_batch(iter_k, kBatchB, gemm_iter > 0); - } - - if (iter_k == ITER_K - 2) { - iter_A.prefetch_batch(iter_k + 1, kBatchA, gemm_iter > 0); - iter_Q.prefetch_batch(iter_k + 1, kBatchQ, gemm_iter > 0); - iter_B.prefetch_batch(iter_k + 1, kBatchB, gemm_iter > 0); - - __pipeline_commit(); - __pipeline_wait_prior(STAGES - 2); - sync_slice(slice_id); - - iter_A.next_stage(); - iter_Q.next_stage(); - iter_B.next_stage(); - - warp_iter_A.next_stage(); - warp_iter_B.next_stage(); - - --gemm_iter; - } - } - } - - template - __device__ static void copy(T (&dst)[N], const T (&src)[N]) - { - PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - dst[i] = src[i]; - } - } - - template - __device__ static void clear(T (&dst)[N]) - { - PRAGMA_UNROLL - for (int i = 0; i < N; ++i) { - dst[i] = T{}; - } - } - - __device__ void sync_slice(int slice_id) - { - if constexpr (SLICES == 1) { - __syncthreads(); - } - else { - 
constexpr int SLICE_GROUP = (SLICES + 7) / 8; - constexpr uint32_t num_threads = kWarpCountMN * WARP_SIZE; - const uint32_t barrier_id = slice_id / SLICE_GROUP + 1; - asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "n"(num_threads)); - } - } - - __device__ void load_partial(float* tb_frag_C, const float* partial_C, int cta, int slice_id) - { - if (slice_id == 0) { - PRAGMA_UNROLL - for (int i = 0; i < CTA_N; ++i) { - tb_frag_C[i] += partial_C[cta * CTA_N * CTA_M + i * CTA_M + threadIdx.x]; - } - } - } - - __device__ void store_partial(float* partial_C, const float* tb_frag_C, int cta, int slice_id) - { - if (slice_id == 0) { - PRAGMA_UNROLL - for (int i = 0; i < CTA_N; ++i) { - partial_C[cta * CTA_N * CTA_M + i * CTA_M + threadIdx.x] = tb_frag_C[i]; - } - } - } - - template - __device__ void store_accum(float* tb_frag_C, - float* tb_smem_C, - half* C, - int m, - int n, - int cta_m, - int cta_n, - int warp_id_m, - int warp_id_n, - int lane_id, - int slice_id) - { - - if (slice_id != 0) { - return; - } - - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#mma-16816-c - PRAGMA_UNROLL - for (int i = 0; i < WARP_N / OP_N; ++i) { - const float2* frag_C = (float2*)&tb_frag_C[i * WARP_M / OP_M * 4]; - const int nn = cta_n + warp_id_n * WARP_N + i * OP_N + lane_id / 4; - PRAGMA_UNROLL - for (int j = 0; j < WARP_M / OP_M; ++j) { - PRAGMA_UNROLL - for (int x = 0; x < 2; ++x) { - const int mm = cta_m + warp_id_m * WARP_M + j * OP_M + x * 8 + lane_id % 4 * 2; - // convert to half - half2 half_C = __float22half2_rn(frag_C[j * 2 + x]); - // transpose 8x8 accum tile - uint trans_C = transpose_m8n8_b16((uint&)half_C); - // store to global memory - OutputOps::template apply(trans_C, mm, nn, C, m, n); - } - } - } - } - - __device__ void - sum_slices(float* tb_frag_C, float* tb_smem_C, int warp_id_m, int warp_id_n, int lane_id, int slice_id) - { - - int offset_m = warp_id_m * WARP_M / OP_M; - int offset_n = warp_id_n * WARP_N / OP_N; - - PRAGMA_UNROLL - for (int z = 0; z < SLICES; ++z) { - if (slice_id == z) { - PRAGMA_UNROLL - for (int i = 0; i < WARP_N / OP_N; ++i) { - PRAGMA_UNROLL - for (int j = 0; j < WARP_M / OP_M; ++j) { - PRAGMA_UNROLL - for (int x = 0; x < 4; ++x) { - int src = (i * WARP_M / OP_M + j) * 4 + x; - int dst = ((i + offset_n) * CTA_M / OP_M + j + offset_m) * 4 + x; - if (z > 0) { - using namespace ops; - tb_frag_C[src] = tb_smem_C[dst * WARP_SIZE + lane_id] + tb_frag_C[src]; - } - tb_smem_C[dst * WARP_SIZE + lane_id] = tb_frag_C[src]; - } - } - } - } - __syncthreads(); - } - - if (slice_id == 0) { - PRAGMA_UNROLL - for (int i = 0; i < WARP_N / OP_N; ++i) { - PRAGMA_UNROLL - for (int j = 0; j < WARP_M / OP_M; ++j) { - PRAGMA_UNROLL - for (int x = 0; x < 4; ++x) { - int src = ((i + offset_n) * CTA_M / OP_M + j + offset_m) * 4 + x; - int dst = (i * WARP_M / OP_M + j) * 4 + x; - - tb_frag_C[dst] = tb_smem_C[src * WARP_SIZE + lane_id]; - } - } - } - } - } - - Array warp_frag_A_[2][WARP_M / OP_M]; - Array warp_frag_B_[2][WARP_N / OP_N]; - - __device__ void run_v2(half* __restrict__ C, - const uint* __restrict__ A, - const half* __restrict__ B, - const half2* __restrict__ Q, - int M, - int N, - int K, - int output_op_idx) - { - static_assert(WARP_M % OP_N == 0); - - float tb_frag_C[(WARP_N / OP_N) * (WARP_M / OP_M) * 4]; - - extern __shared__ uint8_t smem[]; - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int warp_id_m = warp_id % kWarpCountM; - const int warp_id_nk = warp_id / kWarpCountM; - const int warp_id_n = 
warp_id_nk % kWarpCountN; - const int warp_id_k = warp_id_nk / kWarpCountN; - - const int warp_id_mn = warp_id_n * kWarpCountM + warp_id_m; - - const int slice_id = warp_id_k; - - const int cta_k = slice_id * SLICE_K; // sliced-k offset - const int cta_m = blockIdx.x * CTA_M; - const int cta_n = blockIdx.y * CTA_N; - - // each slice has its own partition of smem - uint4* const tb_smem_A = (uint4*)(smem + IteratorA::kSmemByteSize * slice_id); - half* const tb_smem_B = (half*)(smem + IteratorA::kSmemByteSize * SLICES + IteratorB::kSmemByteSize * slice_id); - - // [CTA_N / OP_N, CTA_M / OP_M, 4, WARP_SIZE], all mn fragments in CTA - float* const tb_smem_C = (float*)smem; - - __shared__ typename IteratorQ::Storage tb_smem_Q_storage; - - auto tb_smem_Q = tb_smem_Q_storage.data[slice_id]; - - IteratorA iter_A{A, tb_smem_A, M, K, cta_m, cta_k, warp_id_mn, lane_id}; - IteratorQ iter_Q{Q, tb_smem_Q, M, K, cta_m, cta_k, warp_id_mn, lane_id}; - IteratorB iter_B{B, tb_smem_B, K, N, cta_n, cta_k, warp_id_mn, lane_id}; - - const int offset_m = warp_id_m * WARP_M + lane_id; - - WarpIterA warp_iter_A(iter_A.smem_, iter_Q.smem_, warp_id, lane_id, offset_m, cta_k); - WarpIterB warp_iter_B(iter_B.smem_int_ptr_, warp_id_n, lane_id, 0); - - int gemm_iter = (K + CTA_K - 1) / CTA_K; - - PRAGMA_UNROLL - for (int stage = 0; stage < STAGES - 1; ++stage, --gemm_iter) { - iter_A.prefetch_stage(gemm_iter > 0); - iter_Q.prefetch_stage(gemm_iter > 0); - iter_B.prefetch_stage(gemm_iter > 0); - __pipeline_commit(); - } - - clear(tb_frag_C); - - __pipeline_wait_prior(STAGES - 2); - sync_slice(slice_id); - - warp_iter_A.load(warp_frag_A_[0], 0); - warp_iter_B.load(warp_frag_B_[0], 0); - - PRAGMA_NO_UNROLL - for (; gemm_iter > -STAGES + 1;) { - warp_mma(iter_A, iter_Q, iter_B, warp_iter_A, warp_iter_B, tb_frag_C, slice_id, gemm_iter); - } - - __pipeline_commit(); - __pipeline_wait_prior(0); - __syncthreads(); - - if constexpr (SLICES > 1) { - sum_slices(tb_frag_C, tb_smem_C, warp_id_m, warp_id_n, lane_id, slice_id); - } - - switch (output_op_idx) { - case 0: - store_accum<0>(tb_frag_C, tb_smem_C, C, M, N, cta_m, cta_n, warp_id_m, warp_id_n, lane_id, slice_id); - break; - case 1: - store_accum<1>(tb_frag_C, tb_smem_C, C, M, N, cta_m, cta_n, warp_id_m, warp_id_n, lane_id, slice_id); - break; - case 2: - store_accum<2>(tb_frag_C, tb_smem_C, C, M, N, cta_m, cta_n, warp_id_m, warp_id_n, lane_id, slice_id); - break; - default: - return; - } - } -}; - -template -__global__ void gemm_s4_f16_nn(half* __restrict__ C, - const uint* __restrict__ A, - const half* __restrict__ B, - const half2* __restrict__ Q, - int M, - int N, - int K, - int output_op_idx) -{ -#if __CUDA_ARCH__ >= 750 - Gemm{}.run_v2(C, A, B, Q, M, N, K, output_op_idx); -#endif -} - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/metric.h b/src/turbomind/kernels/gemm_s_f16/metric.h deleted file mode 100644 index 69ef242cf8..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/metric.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. 
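For orientation, the epilogue in `store_accum` above transposes each 8x8 accumulator tile with `movmatrix` and then has every lane write two adjacent m elements of one output column. The index arithmetic it uses, with the CTA/warp offsets dropped and the warp-tile indices `i`, `j` fixed for illustration, can be reproduced on the host like this (an illustrative sketch, not part of the kernel):

```cpp
#include <cstdio>

int main()
{
    const int OP_M = 16, OP_N = 8;
    const int i = 0, j = 0;  // warp-level n/m tile indices (illustrative)
    for (int lane_id = 0; lane_id < 32; ++lane_id) {
        for (int x = 0; x < 2; ++x) {          // x selects the upper/lower 8 rows
            const int nn = i * OP_N + lane_id / 4;
            const int mm = j * OP_M + x * 8 + lane_id % 4 * 2;
            std::printf("lane %2d, pair %d -> (m=%2d..%2d, n=%d)\n",
                        lane_id, x, mm, mm + 1, nn);
        }
    }
    return 0;
}
```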
- -#pragma once - -#include -#include -#include -#include -#include - -namespace turbomind { - -struct Metric { - int id; - bool feasible; - bool prefer; - - std::array cta_shape; - std::array warp_shape; - - int warps; - int stages; - int max_active_ctas; - float smem; - - float cta_cnt_m; - float cta_cnt_n; - float cta_iter_k; - float grid_size; - - int active_ctas; - float waves; - float waves1; - float occupancy; - - float tile_efficiency; - float wave_efficiency; - - float grid_a0; - float grid_b0; - float grid_a1; - float grid_b1; - float grid_mm; - - float grid_sum; - float grid_norm; - - float cta_sum; - float cta_wave; - - int best; - float time; - int count; -}; - -inline void DumpMetrics(std::ostream& os, const std::vector& metrics, const std::vector& indices = {}) -{ - auto dump_shape = [](const std::array& shape) { - std::stringstream ss; - ss << std::setw(4) << shape[0] << std::setw(4) << shape[1] << std::setw(4) << shape[2]; - return ss.str(); - }; - - std::vector> infos{ - {"id", 4}, {"valid", 6}, {"cta_mnk", 14}, {"warp_mnk", 14}, {"warps", 6}, {"stages", 8}, - {"smem", 8}, {"cta_cnt_m", 10}, {"cta_cnt_n", 10}, {"cta_iter_k", 11}, {"max_ctas", 9}, {"act_ctas", 10}, - {"waves", 12}, {"waves1", 12}, {"occupancy", 12}, {"%tile", 10}, {"%wave", 10}, {"grid_a0", 12}, - {"grid_b0", 12}, {"grid_a1", 12}, {"grid_b1", 12}, {"grid_mm", 12}, {"grid_sum", 12}, {"cta_cnt", 8}, - {"cta_sum", 8}, {"cta_wave", 9}, {"grid_norm", 12}, {"time", 12}, {"best", 7}}; - - for (const auto& [name, width] : infos) { - os << std::setw(width) << name; - } - os << "\n"; - - for (size_t i = 0; i < metrics.size(); ++i) { - auto& metric = indices.empty() ? metrics[i] : metrics[indices[i]]; - int c = 0; - os << std::setw(std::get<1>(infos[c++])) << metric.id; - os << std::setw(std::get<1>(infos[c++])) << metric.feasible; - os << std::setw(std::get<1>(infos[c++])) << dump_shape(metric.cta_shape); - os << std::setw(std::get<1>(infos[c++])) << dump_shape(metric.warp_shape); - os << std::setw(std::get<1>(infos[c++])) << metric.warps; - os << std::setw(std::get<1>(infos[c++])) << metric.stages; - os << std::setw(std::get<1>(infos[c++])) << metric.smem; - os << std::setw(std::get<1>(infos[c++])) << metric.cta_cnt_m; - os << std::setw(std::get<1>(infos[c++])) << metric.cta_cnt_n; - os << std::setw(std::get<1>(infos[c++])) << metric.cta_iter_k; - os << std::setw(std::get<1>(infos[c++])) << metric.max_active_ctas; - os << std::setw(std::get<1>(infos[c++])) << metric.active_ctas; - os << std::setw(std::get<1>(infos[c++])) << metric.waves; - os << std::setw(std::get<1>(infos[c++])) << metric.waves1; - os << std::setw(std::get<1>(infos[c++])) << metric.occupancy; - os << std::setw(std::get<1>(infos[c++])) << metric.tile_efficiency; - os << std::setw(std::get<1>(infos[c++])) << metric.wave_efficiency; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_a0; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_b0; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_a1; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_b1; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_mm; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_sum; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_size; - os << std::setw(std::get<1>(infos[c++])) << metric.cta_sum; - os << std::setw(std::get<1>(infos[c++])) << metric.cta_wave; - os << std::setw(std::get<1>(infos[c++])) << metric.grid_norm; - os << std::setw(std::get<1>(infos[c++])) << metric.time * 1000 / metric.count; - os << 
std::setw(std::get<1>(infos[c++])) << (metric.best ? "*" : ""); - os << "\n"; - } -} - -} // namespace turbomind diff --git a/src/turbomind/kernels/gemm_s_f16/warp_iterator.h b/src/turbomind/kernels/gemm_s_f16/warp_iterator.h deleted file mode 100644 index 853720a8f1..0000000000 --- a/src/turbomind/kernels/gemm_s_f16/warp_iterator.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include "common.h" - -namespace turbomind { - -template -struct WarpIteratorA { - - static_assert(WARP_K % GROUP_SIZE == 0 || GROUP_SIZE % WARP_K == 0); - - static constexpr int ITER_M = 32 / OP_M; - static constexpr int ITER_X = WARP_M / 32; - - uint4 frag_A4_[ITER_X]; // 8 value per uint - half2 frag_Q_[ITER_X][4]; // 4 m8k8 tile along M, as WARP_M == 32 - - const uint4* smem_A_; - const half2* smem_Q_; - const int offset_m_; - const int offset_m_Q_; - - int stage_{0}; - int offset_A_{0}; - int offset_Q_{0}; - - __device__ WarpIteratorA(uint4* smem_A, half2* smem_Q, int warp_id, int lane_id, int offset_m, int offset_k): - smem_A_(smem_A), smem_Q_(smem_Q), offset_m_(offset_m), offset_m_Q_(offset_m / 32 * 32 + lane_id / 4) - { - } - - // iter_k must be a compile tile constant - __device__ void load(Array* data, int iter_k) - { - // load A - // smem_A uint4 [SLICE_K/32, CTA_M/32, WARP_SIZE], load as uint4 to avoid bank-conflicts - if (iter_k % 2 == 0) { - PRAGMA_UNROLL - for (int x = 0; x < ITER_X; ++x) { - frag_A4_[x] = smem_A_[offset_A_ + (iter_k / 2) * CTA_M + x * 32 + offset_m_]; - } - } - - // load Q - if (iter_k * OP_K % GROUP_SIZE == 0) { - const int g = iter_k * OP_K / GROUP_SIZE; - PRAGMA_UNROLL - for (int x = 0; x < ITER_X; ++x) { - PRAGMA_UNROLL - for (int i = 0; i < 4; ++i) { - const int mm = offset_m_Q_ + x * 32 + i * 8; // stride of m8k8 tile - ((uint&)frag_Q_[x][i]) = ((uint&)smem_Q_[offset_Q_ + g * CTA_M + mm]); - } - } - } - - PRAGMA_UNROLL - for (int x = 0; x < ITER_X; ++x) { - const uint* frag_A = (uint*)&frag_A4_[x]; - PRAGMA_UNROLL - for (int iter_m = 0; iter_m < ITER_M; ++iter_m) { - uint4 tmp = dequantize_s4_to_fp16x2_v2(frag_A[iter_k % 2 * 2 + iter_m]); - auto& vec = (Array&)tmp; - - vec[0] = apply_Q(vec[0], frag_Q_[x][iter_m * 2]); - vec[1] = apply_Q(vec[1], frag_Q_[x][iter_m * 2 + 1]); - vec[2] = apply_Q(vec[2], frag_Q_[x][iter_m * 2]); - vec[3] = apply_Q(vec[3], frag_Q_[x][iter_m * 2 + 1]); - - data[x * ITER_M + iter_m] = (Array&)vec; - } - } - } - - __device__ void next_stage() - { - ++stage_; - if (stage_ >= STAGES) { - stage_ = 0; - } - offset_A_ = stage_ * kSizePerStageA; - offset_Q_ = stage_ * kSizePerStageQ; - } -}; - -template -struct WarpIteratorB { - - static constexpr int kLdsmNum = WARP_N == 8 ? 
2 : 4; - static constexpr int ITER_N = WARP_N / OP_N; - static constexpr int ITER_K = WARP_K / OP_K; - - static_assert(OP_N == 8 && OP_K == 16); - - const int warp_id_n_; - const int lane_id_; - - const int ldsm_group_id_; - - const int offset_k_; - int offset_n_; - - const uint32_t smem_base_ptr_; - - uint32_t smem_ptr_; - - int stage_{0}; - - __device__ WarpIteratorB(uint32_t smem_int_ptr, int warp_id_n, int lane_id, int offset_k): - smem_base_ptr_(smem_int_ptr), - smem_ptr_(smem_base_ptr_), - warp_id_n_(warp_id_n), - lane_id_(lane_id), - ldsm_group_id_(lane_id / 8), - offset_k_(ldsm_group_id_ % 2 * 8 + offset_k), - offset_n_(ldsm_group_id_ / 2 * 8 + lane_id % 8) - { - if (kLdsmNum == 2) { - offset_n_ -= ldsm_group_id_ / 2 * 8; - } - offset_n_ += warp_id_n_ * WARP_N; - } - - __device__ void load(Array* data, int iter_k) - { - const int kk = iter_k * OP_K + offset_k_; - auto ptr = (uint*)data; - PRAGMA_UNROLL - for (int iter_n = 0; iter_n < ITER_N;) { - const int nn = offset_n_ + iter_n * OP_N; - auto src = smem_ptr_ + sizeof(half) * (nn * SMEM_STRIDE + kk); - if constexpr (kLdsmNum == 4) { - ldmatrix_m8n8_x4_b16(ptr[0], ptr[1], ptr[2], ptr[3], src); - ptr += 4; - iter_n += 2; - } - else { - ldmatrix_m8n8_x2_b16(ptr[0], ptr[1], src); - ptr += 2; - iter_n += 1; - } - } - } - - __device__ void next_stage() - { - ++stage_; - if (stage_ >= STAGES) { - stage_ = 0; - } - smem_ptr_ = smem_base_ptr_ + stage_ * sizeof(half) * CTA_N * SMEM_STRIDE; - } -}; - -} // namespace turbomind diff --git a/src/turbomind/kernels/unfused_attention_kernels.cu b/src/turbomind/kernels/unfused_attention_kernels.cu index 6d95c789a1..7f733a6dfc 100644 --- a/src/turbomind/kernels/unfused_attention_kernels.cu +++ b/src/turbomind/kernels/unfused_attention_kernels.cu @@ -15,7 +15,7 @@ * limitations under the License. */ -#include "src/turbomind/kernels/attention/array_ops.h" +#include "src/turbomind/kernels/attention/rotary_embedding.h" #include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/unfused_attention_kernels.h" #include "src/turbomind/utils/cuda_type_utils.cuh" diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 898a742528..77c26e9e51 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -8,6 +8,7 @@ find_package(CUDAToolkit REQUIRED) add_library(Llama STATIC LlamaV2.cc LlamaBatch.cc + LlamaLinear.cu BlockManager.cc BlockTrie.cc SequenceManager.cc @@ -22,7 +23,7 @@ add_library(Llama STATIC set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart - gemm_s4_f16 + gemm2 cublasMMWrapper DynamicDecodeLayer activation_kernels diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index a522c6c1ab..8aafbe4b7b 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1,7 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. 
#include "src/turbomind/models/llama/LlamaBatch.h" -#include "src/turbomind/kernels/attention/data_type.h" +#include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/macro.h" @@ -628,6 +628,8 @@ void LlamaBatch::Initialize(GenerationState& g) g.finished_count = 0; g.skip_init_sampling = skip_init_sampling; + // TM_LOG_ERROR("[Initialize] batch size: %d, active size: %d", state_->size, state_->active_size); + if (!skip_init_sampling) { g.max_init_ctx_len = max_context_len; g.step = max_context_len; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index d055248d5b..30cc363c41 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -19,10 +19,17 @@ // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" +#include "src/turbomind/kernels/gemm/cast.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/memory_utils.h" +#include +#include #include +#include namespace turbomind { @@ -84,7 +91,9 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, } } } - fused_up_and_gate_ = weight_type_ == WeightType::kINT4 && ffn_weights.gating.lora.policy != LoraPolicy::kPlora; + // fused_up_and_gate_ = weight_type_ == WeightType::kINT4 && ffn_weights.gating.lora.policy != LoraPolicy::kPlora; + + fused_up_and_gate_ = true && ffn_weights.gating.lora.policy != LoraPolicy::kPlora; self_attn_weights.qkv.input_dims = hidden_units_; self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; @@ -110,24 +119,49 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, ffn_weights.fused_gating_intermediate.output_dims = inter_size_ / tensor_para_size_ * 2; ffn_weights.fused_gating_intermediate.type = weight_type; ffn_weights.fused_gating_intermediate.group_size = group_size; + ffn_weights.is_fused_silu = weight_type == WeightType::kINT4; ffn_weights.output.input_dims = inter_size_ / tensor_para_size_; ffn_weights.output.output_dims = hidden_units_; ffn_weights.output.type = weight_type; ffn_weights.output.group_size = group_size; + mallocWeights(); } +template +size_t LlamaDecoderLayerWeight::workspace_size() const noexcept +{ + if (weight_type_ != WeightType::kINT4) { + return 0; + } + + auto get_size = [](const auto& w) { return (size_t)w.input_dims * w.output_dims; }; + + size_t size = 0; + + size = std::max(size, get_size(self_attn_weights.qkv)); + size = std::max(size, get_size(ffn_weights.gating)); + + if (fused_up_and_gate_) { + size = std::max(size, get_size(ffn_weights.fused_gating_intermediate)); + } + + return size * sizeof(uint16_t); +} + template void freeWeights(LlamaDenseWeight& weights) { cudaFree(weights.kernel); cudaFree(weights.bias); - cudaFree(weights.scales_and_zeros); + cudaFree(weights.scales); + cudaFree(weights.zeros); - weights.kernel = nullptr; - weights.bias = nullptr; - weights.scales_and_zeros = nullptr; + weights.kernel = nullptr; + weights.bias = nullptr; + weights.scales = nullptr; + weights.zeros = 
nullptr; { cudaFree(weights.lora.a); @@ -152,8 +186,8 @@ void mallocWeights(LlamaDenseWeight& weights, bool bias) FT_CHECK(weights.input_dims % factor == 0); deviceMalloc((int**)&weights.kernel, weights.input_dims * weights.output_dims / factor); deviceMemSetZero((int*)weights.kernel, weights.input_dims * weights.output_dims / factor); - // interleaved scales/zeros - deviceMalloc((T**)&weights.scales_and_zeros, weights.input_dims / weights.group_size * weights.output_dims * 2); + deviceMalloc((T**)&weights.scales, weights.input_dims / weights.group_size * weights.output_dims); + deviceMalloc((T**)&weights.zeros, weights.input_dims / weights.group_size * weights.output_dims); } if (weights.lora.r > 0) { @@ -196,11 +230,16 @@ void getWeightTensor(LlamaDenseWeight& weights, bool bias, const std::string& TYPE_INT32, {weights.input_dims * weights.output_dims * sizeof(int) / factor}, weights.kernel}); - output.insert(get_name("scales_zeros"), + output.insert(get_name("scales"), + Tensor{MEMORY_GPU, + getTensorType(), + {weights.input_dims / weights.group_size * weights.output_dims * sizeof(T)}, + weights.scales}); + output.insert(get_name("zeros"), Tensor{MEMORY_GPU, getTensorType(), - {weights.input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)}, - weights.scales_and_zeros}); + {weights.input_dims / weights.group_size * weights.output_dims * sizeof(T)}, + weights.zeros}); } if (weights.lora.r) { @@ -326,7 +365,8 @@ void loadWeights(LlamaDenseWeight& w, const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1; - loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {}); + loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type, {}); + loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type, {}); } } @@ -339,14 +379,8 @@ void LlamaDecoderLayerWeight::mallocWeights() turbomind::mallocWeights(self_attn_weights.qkv, attn_bias_); turbomind::mallocWeights(self_attn_weights.output, attn_bias_); - if (fused_up_and_gate_) { - turbomind::mallocWeights(ffn_weights.fused_gating_intermediate, false); - } - else { - turbomind::mallocWeights(ffn_weights.gating, false); - turbomind::mallocWeights(ffn_weights.intermediate, false); - } - + turbomind::mallocWeights(ffn_weights.gating, false); + turbomind::mallocWeights(ffn_weights.intermediate, false); turbomind::mallocWeights(ffn_weights.output, false); } @@ -355,18 +389,15 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() { cudaFree((void*)self_attn_norm_weights); cudaFree((void*)ffn_norm_weights); + self_attn_norm_weights = nullptr; + ffn_norm_weights = nullptr; freeWeights(self_attn_weights.qkv); freeWeights(self_attn_weights.output); - if (fused_up_and_gate_) { - freeWeights(ffn_weights.fused_gating_intermediate); - } - else { - freeWeights(ffn_weights.gating); - freeWeights(ffn_weights.intermediate); - } - + freeWeights(ffn_weights.fused_gating_intermediate); + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); freeWeights(ffn_weights.output); } @@ -390,19 +421,18 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type, tensor_para_size_, 0); - if (fused_up_and_gate_) { - loadWeights(ffn_weights.fused_gating_intermediate, - dir_path + ".feed_forward.w13", - tensor_para_rank_, - type, - tensor_para_size_, - 1); - } - else { - loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", 
tensor_para_rank_, type, tensor_para_size_, 1); - loadWeights( - ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_, 1); - } + // if (fused_up_and_gate_) { + // loadWeights(ffn_weights.fused_gating_intermediate, + // dir_path + ".feed_forward.w13", + // tensor_para_rank_, + // type, + // tensor_para_size_, + // 1); + // } + // else { + loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_, 1); + loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_, 1); + // } loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_, 0); } @@ -420,19 +450,233 @@ TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); }; getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output); - getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output); + getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); + getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); + getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); + + return output; +} + +template +static void convert(LlamaDenseWeight& weight, void* workspace, size_t size, bool use_simt) +{ + if (weight.type != WeightType::kINT4) { + return; + } + + using namespace gemm; + + auto [order_b, pack_b, order_v, pack_v] = get_weight_and_scales_layout(getSMVersion(), use_simt); + + if (order_b == kColMajor) { + transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims); + cudaMemcpy(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault); + } + + extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims); + sync_check_cuda_error(); + + if constexpr (0) { + std::vector tmp(weight.input_dims * weight.output_dims); + cudaMemcpy(tmp.data(), workspace, sizeof(uint16_t) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << *it << " "; + } + i = 0; + std::cout << "\n"; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << *it << " "; + } + } + + MatrixLayout w_desc{ + gemm::DataType::F16, + order_b, + (int)weight.input_dims, // k + (int)weight.output_dims, // n + order_b == kRowMajor ? 
(int)weight.output_dims : (int)weight.input_dims, + }; + + MatrixLayout k_desc = w_desc; + k_desc.type = gemm::DataType::U4; + k_desc.pack = pack_b; + + cudaMemset(weight.kernel, 0, weight.input_dims * weight.output_dims / 2); + + FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, 0) == 0); + sync_check_cuda_error(); + + if constexpr (0) { + std::vector tmp(weight.input_dims * weight.output_dims / 8); + cudaMemcpy(tmp.data(), weight.kernel, sizeof(uint32_t) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + i = 0; + std::cout << "\n"; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + } + + const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; + + if constexpr (std::is_same_v) { + // std::cout << "fuse_scales_and_zeros\n"; + fuse_scales_and_zeros((T*)workspace, weight.scales, weight.zeros, scale_count); + // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); + sync_check_cuda_error(); + } + + cudaDeviceSynchronize(); + + cudaFree(weight.scales); + cudaFree(weight.zeros); + weight.scales = weight.zeros = nullptr; + + deviceMalloc((T**)&weight.scales_zeros, scale_count * 2); + + MatrixLayout s_desc{ + gemm::DataType::U32, + order_v, + (int)weight.input_dims / weight.group_size, // k + (int)weight.output_dims, // n + (int)weight.output_dims, + }; + + MatrixLayout q_desc = s_desc; + q_desc.pack = pack_v; + + FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, 0) == 0); + sync_check_cuda_error(); + + if constexpr (0) { + std::vector tmp(scale_count * 2); + cudaMemcpy(tmp.data(), weight.scales_zeros, sizeof(T) * tmp.size(), cudaMemcpyDefault); + cudaDeviceSynchronize(); + // for (const auto& x: tmp) { + // std::cout << (float)x << " "; + // } + int i = 0; + for (auto it = tmp.begin(); i < 1000 && it != tmp.end(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + i = 0; + std::cout << "\n"; + for (auto it = tmp.rbegin(); i < 1000 && it != tmp.rend(); ++it, ++i) { + std::cout << std::hex << *it << " "; + } + } + + weight.k_desc = k_desc; + weight.q_desc = q_desc; + + // FT_CHECK(0); +} + +template +void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void* workspace, size_t size) +{ + FT_CHECK(c.input_dims == a.input_dims); + FT_CHECK(c.input_dims == b.input_dims); + FT_CHECK(c.output_dims == a.output_dims * 2); + FT_CHECK(c.output_dims == b.output_dims * 2); + FT_CHECK(c.group_size == a.group_size); + FT_CHECK(c.group_size == b.group_size); + + if (a.type == WeightType::kINT4) { + uint8_t* tmp_a = (uint8_t*)workspace; + uint8_t* tmp_b = tmp_a + a.output_dims * a.input_dims; + uint8_t* tmp_c = tmp_b + b.output_dims * b.input_dims; + + const auto sentinel = tmp_c + c.output_dims * c.input_dims; + FT_CHECK(sentinel <= (uint8_t*)workspace + size); + + extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims); + extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims); + + interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, 0); + + compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims); + + interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, 0); + interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, 0); + } + else { + FT_CHECK_WITH_INFO(0, "not 
implemented"); + interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, 0); + } + + // Check at function level + sync_check_cuda_error(); +} + +template +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t) +{ + FT_CHECK(c.input_dims == a.input_dims); + FT_CHECK(c.input_dims == b.input_dims); + FT_CHECK(c.output_dims == a.output_dims * 2); + FT_CHECK(c.output_dims == b.output_dims * 2); + FT_CHECK(c.group_size == a.group_size); + FT_CHECK(c.group_size == b.group_size); + + auto _chunks = [](auto c, auto a, auto b, int height, int width) { + check_cuda_error(cudaMemcpy2D((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault)); + check_cuda_error(cudaMemcpy2D((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault)); + }; + + if (c.type == WeightType::kINT4) { + _chunks(c.kernel, a.kernel, b.kernel, a.input_dims, 4 * a.output_dims / 8); + _chunks(c.scales, a.scales, b.scales, a.input_dims / a.group_size, sizeof(T) * a.output_dims); + _chunks(c.zeros, a.zeros, b.zeros, a.input_dims / a.group_size, sizeof(T) * a.output_dims); + } + else { + _chunks(c.kernel, a.kernel, b.kernel, a.input_dims, sizeof(T) * a.output_dims); + } + + // Check at function level + sync_check_cuda_error(); +} + +template +void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop) +{ + const bool is_16xx = is_16xx_series(prop.name); + + convert(self_attn_weights.qkv, workspace, size, is_16xx); + convert(self_attn_weights.output, workspace, size, is_16xx); + if (fused_up_and_gate_) { - getWeightTensor(ffn_weights.fused_gating_intermediate, false, get_prefix("feed_forward.w13"), output); + + auto& fused_up_and_gate = ffn_weights.fused_gating_intermediate; + + turbomind::mallocWeights(fused_up_and_gate, false); + + if (ffn_weights.is_fused_silu) { + interleave(fused_up_and_gate, ffn_weights.gating, ffn_weights.intermediate, workspace, size); + } + else { + chunk(fused_up_and_gate, ffn_weights.gating, ffn_weights.intermediate, workspace, size); + } + + convert(ffn_weights.fused_gating_intermediate, workspace, size, is_16xx); + + freeWeights(ffn_weights.gating); + freeWeights(ffn_weights.intermediate); } else { - getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output); - getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output); + convert(ffn_weights.gating, workspace, size, is_16xx); + convert(ffn_weights.intermediate, workspace, size, is_16xx); } - getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output); - return output; + convert(ffn_weights.output, workspace, size, is_16xx); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 5086adf8ee..05600d0f56 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -49,6 +49,10 @@ struct LlamaDecoderLayerWeight { TensorMap getParams(std::string prefix); + void prepare(void* workspace, size_t size, const cudaDeviceProp& prop); + + size_t workspace_size() const noexcept; + T* self_attn_norm_weights{}; T* ffn_norm_weights{}; LlamaAttentionWeight self_attn_weights{}; diff --git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index 2201852346..a975da1a0d 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ 
b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -19,6 +19,7 @@ #pragma once +#include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { @@ -82,8 +83,13 @@ struct LlamaDenseWeight { LoraWeight lora; WeightType type; T* bias; - T* scales_and_zeros; + T* scales; + T* zeros; + T* scales_zeros; int group_size; + + gemm::MatrixLayout k_desc; + gemm::MatrixLayout q_desc; }; template @@ -98,6 +104,7 @@ struct LlamaFfnWeight { LlamaDenseWeight intermediate; LlamaDenseWeight output; LlamaDenseWeight fused_gating_intermediate; + bool is_fused_silu; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 464e9a2cee..974b340ec1 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -31,11 +31,19 @@ void LlamaFfnLayer::allocateBuffer(size_t token_num, const LlamaDenseWeight* gating, const LlamaDenseWeight* inter) { - size_t sz = sizeof(T) * token_num * inter_size_; - size_t sz_gate = (gating->lora.r > 0) ? sz + sz / inter_size_ * gating->lora.r : sz; - size_t sz_inter = (inter->lora.r > 0) ? sz + sz / inter_size_ * inter->lora.r : sz; - inter_buf_ = (T*)allocator_->reMalloc(inter_buf_, sz_inter, false); - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sz_gate, false); + const size_t sz = token_num * inter_size_; + + const size_t sz_gate = token_num * gating->lora.r; + const size_t sz_inter = token_num * inter->lora.r; + + gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * (sz * 2 + sz_gate + sz_inter), false); + inter_buf_ = gating_buf_ + sz; + + // gate & inter is not fused when lora is enabled + if (gating->lora.r) { + inter_buf_ += sz_gate; + } + is_allocate_buffer_ = true; } @@ -43,31 +51,26 @@ template void LlamaFfnLayer::freeBuffer() { if (is_allocate_buffer_) { - allocator_->free((void**)&inter_buf_); + // allocator_->free((void**)&inter_buf_); allocator_->free((void**)&gating_buf_); is_allocate_buffer_ = false; } } template -void LlamaFfnLayer::activation(int num_token) +void LlamaFfnLayer::activation(int token_num, bool is_chunked) { NvtxScope scope("activation"); - invokeGenericActivation(gating_buf_, - (const T*)nullptr, // bias - inter_buf_, - (const T*)nullptr, // gated_bias - nullptr, // ia3_tasks - (const T*)nullptr, // ia3_weights - num_token, // m - inter_size_, // n - 0, // int8_mode - nullptr, // activation_in - nullptr, // activation_out - nullptr, // padding_offset - 0, // seq_len - stream_); - sync_check_cuda_error(); + if (is_chunked) { + invokeGenericActivation_v2( + gating_buf_, gating_buf_ + inter_size_, inter_size_ * 2, token_num, inter_size_, stream_); + sync_check_cuda_error(); + } + else { + invokeGenericActivation_v2( + gating_buf_, inter_buf_, inter_size_, token_num, inter_size_, stream_); + sync_check_cuda_error(); + } } template @@ -97,8 +100,15 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, if (weights->fused_gating_intermediate.kernel) { NvtxScope scope("fused_silu_ffn"); - linear_.forward( - gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, LlamaLinear::kFusedSiluFfn); + + const auto type = weights->is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; + + linear_.forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type); + sync_check_cuda_error(); + + if (!weights->is_fused_silu) { + activation(num_token, true); + } count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); } @@ -106,6 +116,7 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, { // w1(x) NvtxScope scope("w1"); linear_.forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); } count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3); @@ -113,18 +124,22 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, NvtxScope scope("w3"); linear_.forward( inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); } count_and_fix(inter_buf_, num_token * weights->intermediate.output_dims, Concat("w3", layer_id), 3); // silu(w1(x)) * w3(x) - activation(num_token); + activation(num_token, false); count_and_fix(gating_buf_, num_token * weights->output.input_dims, Concat("act", layer_id), 3); } { // w2(x) NvtxScope scope("w2"); - linear_.forward(ffn_output_data, gating_buf_, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? inter_size_ * 2 : 0; + linear_.forward( + ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); } count_and_fix(ffn_output_data, num_token * weights->output.output_dims, Concat("w2", layer_id), 3); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 6a414305cc..97465ad6d1 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -31,20 +31,20 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(size_t head_num, - size_t size_per_head, - size_t inter_size, - NcclParam tensor_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward): + LlamaFfnLayer(size_t head_num, + size_t size_per_head, + size_t inter_size, + NcclParam tensor_para, + cudaStream_t stream, + LlamaLinear linear, + IAllocator* allocator, + bool is_free_buffer_after_forward): head_num_(head_num), size_per_head_(size_per_head), inter_size_(inter_size / tensor_para.world_size_), hidden_units_(head_num * size_per_head), stream_(stream), - linear_(cublas_wrapper, stream), + linear_(linear), allocator_(allocator), tensor_para_(tensor_para), is_free_buffer_after_forward_(is_free_buffer_after_forward) @@ -63,7 +63,7 @@ class LlamaFfnLayer { void freeBuffer(); - void activation(int num_token); + void activation(int token_num, bool is_chunked); size_t head_num_; size_t size_per_head_; diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu new file mode 100644 index 0000000000..5b9f743174 --- /dev/null +++ b/src/turbomind/models/llama/LlamaLinear.cu @@ -0,0 +1,221 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
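When the fused w1/w3 projection runs without the fused-silu epilogue, the FFN layer above writes the gate and up values for each token into one row of `gating_buf_` (`inter_size_` gate elements followed by `inter_size_` up elements, row stride `2 * inter_size_`), applies the activation over that chunked layout, and then feeds w2 with `pitch = inter_size_ * 2`. A minimal CPU sketch of what the chunked activation computes — an illustration only, not the actual `invokeGenericActivation_v2` kernel:

```cpp
#include <cmath>
#include <cstddef>

// Illustration of the chunked layout [token_num, 2 * inter_size]:
// silu(gate) * up is written back over the gate half of each row.
void silu_mul_chunked(float* buf, int token_num, int inter_size)
{
    for (int t = 0; t < token_num; ++t) {
        float* gate = buf + (size_t)t * 2 * inter_size;  // first half of the row
        float* up   = gate + inter_size;                 // second half of the row
        for (int i = 0; i < inter_size; ++i) {
            const float g = gate[i];
            gate[i] = g / (1.0f + std::exp(-g)) * up[i];
        }
    }
}
```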
+ +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/models/llama/llama_decoder_kernels.h" +#include + +namespace turbomind { + +template +struct LlamaLinear::Impl { + + Impl(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): cublas_wrapper_(cublas_wrapper), stream_(stream) + { + workspace_ = {}; + + workspace_.barriers_size = gemm::Gemm::kBarriersSize; + workspace_.partials_size = gemm::Gemm::kPartialsSize; + cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); + cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_); + cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + } + + ~Impl() + { + cudaFreeAsync(workspace_.barriers, stream_); + cudaFreeAsync(workspace_.partials, stream_); + workspace_ = {}; + } + + void forward(T* output_data, + Pitched input_data, + int batch_size, + const LlamaDenseWeight& weight, + Type type = kGemm, + int* lora_mask = nullptr) + { + if (input_data.pitch == 0) { + input_data.pitch = weight.input_dims; + } + if (lora_mask != nullptr && weight.lora.r > 0) { + FT_CHECK(type == kGemm); + // output = lora(x) * scale + // output = mask(output) + // output = x*W + output + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + weight.lora.r, // m + batch_size, // n + weight.input_dims, // k + (const T*)weight.lora.a, // A + weight.lora.r, // lda + input_data.ptr, // B + input_data.pitch, // ldb + output_data + batch_size * weight.output_dims, // C + weight.lora.r); // ldc + + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + weight.output_dims, // m + batch_size, // n + weight.lora.r, // k + (const T*)weight.lora.b, // A + weight.output_dims, // lda + output_data + batch_size * weight.output_dims, // B + weight.lora.r, // ldb + output_data, // C + weight.output_dims, // ldc + weight.lora.scale, // alpha + 0.0f); // beta + + invokeMask(output_data, lora_mask, batch_size, weight.output_dims, stream_); + sync_check_cuda_error(); + + type = kFusedAdd; + } + switch (weight.type) { + case WeightType::kFP16: + case WeightType::kFP32: + case WeightType::kBF16: + return forwardFp(output_data, input_data, batch_size, weight, type); + case WeightType::kINT4: + return forwardInt4(output_data, input_data, batch_size, weight, type); + default: + FT_CHECK(0); + } + } + + void forwardFp(T* output_data, Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type) + { + cublas_wrapper_->Gemm(CUBLAS_OP_N, + CUBLAS_OP_N, + weight.output_dims, + batch_size, + weight.input_dims, + (const T*)weight.kernel, + weight.output_dims, + input_data.ptr, + input_data.pitch, + output_data, + weight.output_dims, + 1.0f, + type == kFusedAdd ? 1.0f : 0.0f); + sync_check_cuda_error(); + } + + void forwardInt4(T* output_data, Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type) + { + using namespace gemm; + + const Operation operation{dispatch_policy_, + type == kFusedSiluFfn ? Epilogue::kGatedSilu : Epilogue::kNone, + {QuantType::kNone}, + {QuantType::kDefault, weight.group_size}, + 0}; + + const MatrixLayout a_desc{ + get_data_type_v, + kRowMajor, + batch_size, + (int)weight.input_dims, + input_data.pitch, + }; + + const MatrixLayout c_desc{ + get_data_type_v, + kRowMajor, + batch_size, + (int)weight.output_dims, + type == kFusedSiluFfn ? 
(int)weight.output_dims / 2 : (int)weight.output_dims, + }; + + auto ec = gemm_.Run(operation, + 1.f, + input_data.ptr, + a_desc, + nullptr, + {}, + weight.kernel, + weight.k_desc, + weight.scales_zeros, + weight.q_desc, + type == kFusedAdd ? 1.0f : 0.0f, + output_data, + c_desc, + output_data, + c_desc, + workspace_, + stream_); + + if (ec) { + TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); + // std::abort(); + } + } + + cublasMMWrapper* cublas_wrapper_; + gemm::Gemm gemm_; + gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; + cudaStream_t stream_{}; + + gemm::Workspace workspace_; +}; + +template +LlamaLinear::LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): + impl_{std::make_shared(cublas_wrapper, stream)} +{ +} + +template +void LlamaLinear::forward( + T* output_data, Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type, int* lora_mask) +{ + impl_->forward(output_data, input_data, batch_size, weight, type, lora_mask); +} + +template +void LlamaLinear::set_measure(bool measure) +{ + impl_->dispatch_policy_ = measure ? gemm::DispatchPolicy::kMeasure : gemm::DispatchPolicy::kReuse; +} + +template +int LlamaLinear::Export(std::ostream& os) +{ + if (os) { + return impl_->gemm_.Export(os); + } + return 0; +} + +template +int LlamaLinear::Import(std::istream& is) +{ + auto n_records = 0; + if (is) { + n_records = impl_->gemm_.Import(is); + } + if (n_records) { + impl_->dispatch_policy_ = gemm::DispatchPolicy::kReuse; + }; + return n_records; +} + +template +std::vector LlamaLinear::GetTuningSeq() const +{ + return impl_->gemm_.GetTuningSeq(); +} + +#ifdef ENABLE_FP32 +template class LlamaLinear; +#endif +template class LlamaLinear; +#ifdef ENABLE_BF16 +template class LlamaLinear<__nv_bfloat16>; +#endif + +} // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h index b00fb58c55..938188f4bc 100644 --- a/src/turbomind/models/llama/LlamaLinear.h +++ b/src/turbomind/models/llama/LlamaLinear.h @@ -2,15 +2,10 @@ #pragma once -#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/models/llama/llama_decoder_kernels.h" -#include "src/turbomind/models/llama/llama_kernels.h" -#include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cublasMMWrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/logger.h" -#include +#include +#include namespace turbomind { @@ -24,115 +19,32 @@ class LlamaLinear { kFusedAdd }; - LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): cublas_wrapper_(cublas_wrapper), stream_(stream) - { - } + struct Pitched { + const T* ptr; + int pitch; + Pitched(const T* ptr, int pitch = 0): ptr{ptr}, pitch{pitch} {} + }; + + LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream); void forward(T* output_data, - const T* input_data, + Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type = kGemm, - int* lora_mask = nullptr) - { - if (lora_mask != nullptr && weight.lora.r > 0) { - FT_CHECK(type == kGemm); - // output = lora(x) * scale - // output = mask(output) - // output = x*W + output - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.lora.r, // m - batch_size, // n - weight.input_dims, // k - (const T*)weight.lora.a, // A - weight.lora.r, // lda - input_data, // B - weight.input_dims, // ldb - output_data + batch_size * weight.output_dims, // C - weight.lora.r); 
// ldc + int* lora_mask = nullptr); - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, // m - batch_size, // n - weight.lora.r, // k - (const T*)weight.lora.b, // A - weight.output_dims, // lda - output_data + batch_size * weight.output_dims, // B - weight.lora.r, // ldb - output_data, // C - weight.output_dims, // ldc - weight.lora.scale, // alpha - 0.0f); // beta + void set_measure(bool measure); - invokeMask(output_data, lora_mask, batch_size, weight.output_dims, stream_); - type = kFusedAdd; - } - switch (weight.type) { - case WeightType::kFP16: - case WeightType::kFP32: - case WeightType::kBF16: - forwardFp(output_data, input_data, batch_size, weight, type); - break; - case WeightType::kINT4: - forwardInt4(output_data, input_data, batch_size, weight, type); - break; - break; - default: - FT_CHECK(0); - } - } + [[maybe_unused]] int Export(std::ostream& os); -private: - void forwardFp(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight& weight, Type type) - { - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, - batch_size, - weight.input_dims, - (const T*)weight.kernel, - weight.output_dims, - input_data, - weight.input_dims, - output_data, - weight.output_dims, - 1.0f, - type == kFusedAdd ? 1.0f : 0.0f); - sync_check_cuda_error(); - } + [[maybe_unused]] int Import(std::istream& is); - void forwardInt4(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight& weight, Type type) - { - GemmS4F16::Type gemm_type = GemmS4F16::kGemm; - if (type == kFusedAdd) - gemm_type = GemmS4F16::kFusedAdd; - if (type == kFusedSiluFfn) - gemm_type = GemmS4F16::kFusedSiluFfn; - if constexpr (std::is_same_v) { - gemm_s4_f16_.Run(output_data, - (const uint*)weight.kernel, - input_data, - (const half2*)weight.scales_and_zeros, - weight.output_dims, - batch_size, - weight.input_dims, - weight.group_size, - gemm_type, - -1, - stream_); - sync_check_cuda_error(); - } - else { - FT_CHECK_WITH_INFO(0, "Not implemented"); - } - } + std::vector GetTuningSeq() const; private: - cublasMMWrapper* cublas_wrapper_; - cudaStream_t stream_{}; - GemmS4F16 gemm_s4_f16_; + struct Impl; + std::shared_ptr impl_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 0387338c1b..f9f7922ff6 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -22,9 +22,11 @@ #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/kernels/decoding_kernels.h" +#include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/LlamaBatch.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/Request.h" @@ -37,8 +39,13 @@ #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/memory_utils.h" +#include +#include +#include #include #include +#include #include namespace turbomind { @@ -90,6 +97,7 @@ LlamaV2::LlamaV2(size_t head_num, is_free_buffer_after_forward_(is_free_buffer_after_forward), cuda_device_prop_(cuda_device_prop), debug_(isDebug()), + linear_{cublas_wrapper, stream}, lora_params_(lora_params), shared_state_(shared_state) @@ -135,7 +143,7 @@ void 
LlamaV2::initialize(const LlamaAttentionParams& attn_params, rmsnorm_eps_, tensor_para_, stream_, - cublas_wrapper_, + linear_, allocator_, lora_params_, is_free_buffer_after_forward_, @@ -552,6 +560,105 @@ void LlamaV2::forward(std::unordered_map* outputs, } } +template +static std::string Join(First first, Last last, const std::string& delim) +{ + if (first == last) { + return {}; + } + std::ostringstream oss; + oss << *first++; + while (first != last) { + oss << delim << *first++; + } + return oss.str(); +} + +// Only called when `weight_type == INT4` for now +template +void LlamaV2::tune() +{ + + if (auto str = std::getenv("TM_GEMM_IMPORT")) { + std::ifstream ifs(str); + const int n_imported = linear_.Import(ifs); + TM_LOG_INFO("[Gemm2] %d records imported", n_imported); + return; + } + + std::vector bss = linear_.GetTuningSeq(); + if (bss.empty()) { + bss = gemm::GenerateTuningSequence(gemm::GetDefaultTuningGenerators()); + } + + { + auto str = Join(bss.begin(), bss.end(), ", "); + TM_LOG_INFO("[Gemm2] Tuning sequence: %s", str.c_str()); + } + + LlamaAttentionWeight& attn = weights_->decoder_layer_weights[0]->self_attn_weights; + LlamaFfnWeight& ffn = weights_->decoder_layer_weights[0]->ffn_weights; + + std::vector*> weights{&attn.qkv, &attn.output, &ffn.output}; + + for (auto& layer : weights_->decoder_layer_weights) { + if (layer->ffn_weights.gating.kernel) { + weights.push_back(&layer->ffn_weights.gating); + break; + } + } + for (auto& layer : weights_->decoder_layer_weights) { + if (layer->ffn_weights.fused_gating_intermediate.kernel) { + weights.push_back(&layer->ffn_weights.fused_gating_intermediate); + break; + } + } + + const int max_bs = *std::max_element(bss.begin(), bss.end()); + int max_in = 0; + int max_out = 0; + for (auto& w : weights) { + max_in = std::max(max_in, w->input_dims); + max_out = std::max(max_out, w->output_dims); + } + + T* in_data = (T*)allocator_->malloc(sizeof(T) * (size_t)max_bs * max_in); + T* out_data = (T*)allocator_->malloc(sizeof(T) * (size_t)max_bs * max_out); + + cudaRandomUniform(in_data, (size_t)max_bs * max_in); + cudaDeviceSynchronize(); + + linear_.set_measure(true); + + auto tick = std::chrono::steady_clock::now(); + + for (auto bs : bss) { + TM_LOG_INFO("[Gemm2] %d", bs); + for (auto& w : weights) { + linear_.forward(out_data, in_data, bs, *w); + } + } + + auto tock = std::chrono::steady_clock::now(); + + TM_LOG_INFO("[Gemm2] Tuning finished in %.2f seconds.", + std::chrono::duration>(tock - tick).count()); + + linear_.set_measure(false); + + allocator_->free((void**)&in_data); + allocator_->free((void**)&out_data); + + // Only rank-0 exports the dispatch cache + if (tensor_para_.rank_ == 0) { + if (auto path = std::getenv("TM_GEMM_EXPORT")) { + std::ofstream ofs(path); + const auto n_records = linear_.Export(ofs); + TM_LOG_INFO("[Gemm2] %d records exported.", n_records); + } + } +} + template class LlamaV2; #ifdef ENABLE_FP32 template class LlamaV2; diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 61d83b90e0..adf6c4f9d4 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -88,6 +88,8 @@ class LlamaV2 { const std::unordered_map* inputs, Control control); + void tune(); + void stop(const std::vector& seq_ids); size_t vocab_size() const noexcept @@ -188,6 +190,8 @@ class LlamaV2 { LlamaWeight* weights_{}; + LlamaLinear linear_; + std::unique_ptr> unified_decoder_; DynamicDecodeLayer* dynamic_decode_layer_{}; diff --git 
a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index c87bc40899..507f1a6f32 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -19,6 +19,8 @@ // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc #include "src/turbomind/models/llama/LlamaWeight.h" +#include "src/turbomind/utils/memory_utils.h" +#include namespace turbomind { @@ -147,6 +149,23 @@ TensorMap LlamaWeight::getParams() return output; } +template +void LlamaWeight::prepare(const cudaDeviceProp& prop) +{ + const auto workspace_size = decoder_layer_weights[0]->workspace_size(); + char* workspace{}; + + TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d\n", workspace_size); + + if (workspace_size) { + deviceMalloc((char**)&workspace, workspace_size); + } + for (auto& layer : decoder_layer_weights) { + layer->prepare(workspace, workspace_size, prop); + } + deviceFree(workspace); +} + #ifdef ENABLE_FP32 template struct LlamaWeight; #endif diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index 65eb986d83..a180204ae2 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -51,6 +51,8 @@ struct LlamaWeight { TensorMap getParams(); + void prepare(const cudaDeviceProp& prop); + std::vector*> decoder_layer_weights; const T* pre_decoder_embedding_table{}; const T* output_norm_weight{}; diff --git a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index 11be59d0c0..879a39d409 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -1,7 +1,6 @@ // Copyright (c) OpenMMLab. All rights reserved. 
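`LlamaV2::tune()` above measures the linear layers over a batch-size sweep and can persist the resulting GEMM dispatch cache: `TM_GEMM_IMPORT` names a file to load (tuning is then skipped), while on rank 0 `TM_GEMM_EXPORT` names a file to write. A possible workflow — the cache path and model are only examples:

```shell
# first launch of an int4 model: measure kernels, then export the dispatch cache
TM_GEMM_EXPORT=./tm_gemm_cache lmdeploy serve api_server internlm/internlm2-chat-20b-4bits

# later launches: import the cache and skip tuning
TM_GEMM_IMPORT=./tm_gemm_cache lmdeploy serve api_server internlm/internlm2-chat-20b-4bits
```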
-#include "src/turbomind/kernels/attention/array_ops.h" -#include "src/turbomind/kernels/gemm_s_f16/common.h" +#include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/macro.h" #include "src/turbomind/models/llama/llama_kernels.h" diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index 57e00ed7f1..9e2706270e 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -16,7 +16,7 @@ namespace turbomind { -CmpMode compare_mode = kCmpNone; +CmpMode compare_mode = kCmpRead; template struct abs_diff_t { diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 89c9cf4d76..05fff513be 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -170,10 +170,10 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_; - // static int count = 0; + static int count = 0; // if (layer_id == 0 && count == 0) { - // Compare(attention_input, num_token * weights->qkv.input_dims, "qkv_input", kCmpRead, stream_); + // Compare(attention_input, token_num * weights->qkv.input_dims, "qkv_input", compare_mode, stream_); // } int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); @@ -181,13 +181,38 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa /// qkv gemm // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] linear_.forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_, num_token * weights->qkv.output_dims, "qkv_buf", kCmpRead, stream_); + // Compare(qkv_buf_, token_num * weights->qkv.output_dims, "qkv_buf", compare_mode, stream_); // } + if constexpr (0) { + std::vector tmp(token_num * weights->qkv.output_dims); + cudaMemcpyAsync(tmp.data(), qkv_buf_, sizeof(T) * tmp.size(), cudaMemcpyDefault, stream_); + cudaStreamSynchronize(stream_); + int i = 0; + for (auto& x : tmp) { + std::cout << (float)x << " "; + if (++i == 256) { + break; + } + } + std::cout << "\n"; + i = 0; + for (auto it = tmp.rbegin(); it != tmp.rend(); ++it) { + std::cout << (float)*it << " "; + if (++i == 256) { + break; + } + } + std::cout << "\n"; + } + + // FT_CHECK(0); + auto stream_ptr = streams_.data(); auto CreateParams = [&](int offset, int batch_size, int max_kv_splits, cudaStream_t stream) { @@ -265,7 +290,6 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.stream = stream; params.quant_policy = quant_policy_; - return params; }; @@ -321,6 +345,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa ////////////////////////////////////////////// /// output gemm -> linear_.forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); + sync_check_cuda_error(); // ++count; diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index 6b6bbba56c..f632830e5f 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ 
b/src/turbomind/models/llama/unified_attention_layer.h @@ -57,7 +57,7 @@ class UnifiedAttentionLayer { NcclParam tensor_para, LoraParams lora_params, cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, + LlamaLinear linear, IAllocator* allocator, bool is_free_buffer_after_forward, int cache_block_seq_len, @@ -72,8 +72,7 @@ class UnifiedAttentionLayer { tensor_para_(tensor_para), lora_params_(lora_params), stream_(stream), - cublas_wrapper_(cublas_wrapper), - linear_(cublas_wrapper, stream), + linear_(linear), allocator_(allocator), kv_cache_block_len_(cache_block_seq_len), is_free_buffer_after_forward_(is_free_buffer_after_forward), @@ -147,10 +146,9 @@ class UnifiedAttentionLayer { LoraParams lora_params_; - cudaStream_t stream_; - IAllocator* allocator_; - cublasMMWrapper* cublas_wrapper_; - LlamaLinear linear_; + cudaStream_t stream_; + IAllocator* allocator_; + LlamaLinear linear_; cudaStream_t aux_stream_; cudaEvent_t qkv_event_; diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index 914436b349..e29d42680d 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -43,7 +43,7 @@ void UnifiedDecoder::initialize(const LlamaAttentionParams& attn_params, tensor_para_, lora_params_, stream_, - cublas_wrapper_, + linear_, allocator_, is_free_buffer_after_forward_, cache_block_seq_len, @@ -54,7 +54,7 @@ void UnifiedDecoder::initialize(const LlamaAttentionParams& attn_params, inter_size_, tensor_para_, stream_, - cublas_wrapper_, + linear_, allocator_, is_free_buffer_after_forward_); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 7dde36cb9a..0a80b415d5 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -18,10 +18,10 @@ class UnifiedDecoder { void initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int cache_block_seq_len, int quant_policy); - cudaStream_t stream_; - cublasMMWrapper* cublas_wrapper_; - IAllocator* allocator_; - bool is_free_buffer_after_forward_{}; + cudaStream_t stream_; + LlamaLinear linear_; + IAllocator* allocator_; + bool is_free_buffer_after_forward_{}; size_t head_num_; size_t size_per_head_; @@ -69,7 +69,7 @@ class UnifiedDecoder { float rmsnorm_eps, NcclParam tensor_para, cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, + LlamaLinear linear, IAllocator* allocator, const LoraParams& lora_params, bool is_free_buffer_after_forward, @@ -77,7 +77,7 @@ class UnifiedDecoder { int cache_block_seq_len, int quant_policy): stream_(stream), - cublas_wrapper_(cublas_wrapper), + linear_(linear), allocator_(allocator), lora_params_(lora_params), is_free_buffer_after_forward_(is_free_buffer_after_forward), diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 2de1a1008e..973c023725 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -1,4 +1,5 @@ -#include "src/turbomind/kernels/gemm_s_f16/format.h" +// Copyright (c) OpenMMLab. All rights reserved. 
+ #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" @@ -450,61 +451,28 @@ PYBIND11_MODULE(_turbomind, m) py::call_guard(), "device_id"_a, "rank"_a) + .def( + "process_weight", + [](AbstractTransformerModel* model, int deviceId, int rank) { model->processWeights(deviceId, rank); }, + py::call_guard(), + "device_id"_a, + "rank"_a) + .def( + "create_engine", + [](AbstractTransformerModel* model, + int deviceId, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) { + model->createEngine(deviceId, rank, nccl_params, custom_all_reduce_comm); + }, + py::call_guard(), + "device_id"_a, + "rank"_a, + "nccl_params"_a, + "custom_all_reduce_comm"_a = nullptr) .def("__str__", &AbstractTransformerModel::toString) .def("__repr__", &AbstractTransformerModel::toString) .def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize) .def("get_pipeline_para_size", &AbstractTransformerModel::getPipelineParaSize); - - m.def("transpose_qk_s4_k_m8", [](py::object src, py::object dst, int m, int k, int size_per_head) { - auto src_tensor = GetDLTensor(src); - auto dst_tensor = GetDLTensor(dst); - - turbomind::transpose_qk_s4_k_m8_hf( - (uint32_t*)dst_tensor.data, (const uint32_t*)src_tensor.data, m, k, size_per_head, nullptr); - }); - - m.def("fuse_w1_w3_s4_k_m8", [](py::object src, py::object dst, int m, int k) { - auto src_tensor = GetDLTensor(src); - auto dst_tensor = GetDLTensor(dst); - - turbomind::fuse_w1_w3_s4_k_m8((uint32_t*)dst_tensor.data, (const uint32_t*)src_tensor.data, m, k, nullptr); - }); - - m.def("convert_s4_k_m8", - [](py::object A_dst, - py::object Q_dst, - py::object ws, - py::object A_src, - py::object scales, - py::object qzeros, - int m, - int k, - int group_size) { - auto a_dst = GetDLTensor(A_dst); - auto q_dst = GetDLTensor(Q_dst); - auto w = GetDLTensor(ws); - auto a_src = GetDLTensor(A_src); - auto s = GetDLTensor(scales); - auto qz = GetDLTensor(qzeros); - - turbomind::convert_s4_k_m8((uint32_t*)a_dst.data, - (half2*)q_dst.data, - (half*)w.data, - (const uint32_t*)a_src.data, - (const half*)s.data, - (const uint32_t*)qz.data, - m, - k, - group_size, - nullptr); - }); - - m.def("dequantize_s4", [](py::object src, py::object dst) { - auto src_tensor = GetDLTensor(src); - auto dst_tensor = GetDLTensor(dst); - auto src_count = std::accumulate(src_tensor.shape, src_tensor.shape + src_tensor.ndim, size_t{1}); - auto dst_count = std::accumulate(dst_tensor.shape, dst_tensor.shape + dst_tensor.ndim, size_t{1}); - turbomind::FT_CHECK(src_count * 8 == dst_count); - turbomind::dequantize_s4((uint4*)dst_tensor.data, (uint32_t*)src_tensor.data, src_count, nullptr); - }); } diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 87fd2cdf59..d025935bf7 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -25,6 +25,7 @@ #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/allocator.h" +#include "src/turbomind/utils/cuda_utils.h" #include namespace ft = turbomind; @@ -72,6 +73,7 @@ std::shared_ptr AbstractTransformerModel::createLlamaM ft::FT_CHECK(false); #endif } + return nullptr; } template @@ -372,31 +374,21 @@ std::unique_ptr> 
LlamaTritonModel::createSh template std::unique_ptr -LlamaTritonModel::createModelInstance(int device_id, - int rank, - cudaStream_t stream, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comm) +LlamaTritonModel::createModelInstance(int device_id, + int rank, + cudaStream_t stream, + std::pair, std::vector>, + std::shared_ptr) { ft::check_cuda_error(cudaSetDevice(device_id)); - // const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); - - std::shared_ptr> instance; - { - std::lock_guard lock(shared_mutexes_[device_id]); - instance = shared_instances_[device_id]; - if (!instance) { - instance = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); - instance->llm->setFfiLock(ffi_lock_); - shared_instances_[device_id] = instance; - } - } + + ft::FT_CHECK((bool)shared_instances_[device_id]); auto allocator = std::make_unique>(device_id, false); allocator->setStream(stream); - return std::make_unique>(instance, std::move(allocator), device_id); + return std::make_unique>(shared_instances_[device_id], std::move(allocator), device_id); } template @@ -439,6 +431,36 @@ TensorMap LlamaTritonModel::getParams(int deviceId, int rank) return result; } +template +void LlamaTritonModel::processWeights(int device_id, int rank) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + ft::FT_CHECK(shared_weights_[device_id] != nullptr); + + cudaDeviceProp props{}; + ft::check_cuda_error(cudaGetDeviceProperties(&props, device_id)); + + shared_weights_[device_id]->prepare(props); + ft::sync_check_cuda_error(); +} + +template +void LlamaTritonModel::createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm) +{ + + auto instance = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); + instance->llm->setFfiLock(ffi_lock_); + + if (weight_type_ == ft::WeightType::kINT4) { + instance->llm->tune(); + } + + shared_instances_[device_id] = std::move(instance); +} + template std::string LlamaTritonModel::toString() { diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index c0a0ebf3a9..fc7cfca0f2 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -56,6 +56,13 @@ struct LlamaTritonModel: public AbstractTransformerModel { TensorMap getParams(int deviceId, int rank) override; + void processWeights(int deviceId, int rank) override; + + void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) override; + void createCustomComms(std::vector>* custom_all_reduce_comms, int world_size) override; diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 8bf5ec54f5..b48bc9a1d2 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -328,6 +328,13 @@ struct AbstractTransformerModel { virtual TensorMap getParams(int deviceId, int rank) = 0; + virtual void processWeights(int deviceId, int rank) = 0; + + virtual void createEngine(int device_id, + int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr) = 0; + virtual std::string toString() = 0; virtual int getTensorParaSize() = 0; virtual int getPipelineParaSize() = 0; diff --git a/src/turbomind/utils/CMakeLists.txt b/src/turbomind/utils/CMakeLists.txt 
index fe32b6857c..da3d44c26d 100644 --- a/src/turbomind/utils/CMakeLists.txt +++ b/src/turbomind/utils/CMakeLists.txt @@ -102,3 +102,6 @@ add_library(anomaly_handler STATIC anomaly_handler.cu) set_property(TARGET anomaly_handler PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET anomaly_handler PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(anomaly_handler PUBLIC cuda_utils logger) + +add_library(parser STATIC parser.cc) +set_property(TARGET parser PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/src/turbomind/utils/cuda_utils.cc b/src/turbomind/utils/cuda_utils.cc index 45fa06a6d5..db783c5637 100644 --- a/src/turbomind/utils/cuda_utils.cc +++ b/src/turbomind/utils/cuda_utils.cc @@ -17,6 +17,7 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/macro.h" #include "src/turbomind/utils/cuda_fp8_utils.h" +#include namespace turbomind { @@ -392,5 +393,11 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name) return model_file_type; } +bool is_16xx_series(const char* name) +{ + const std::regex re(R"(GTX 16\d\d)"); + return std::regex_search(name, re); +} + /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index f066a0c25b..533263604e 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -483,5 +483,7 @@ void compareTwoTensor( delete[] h_ref; } +bool is_16xx_series(const char* name); + /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/parser.cc b/src/turbomind/utils/parser.cc new file mode 100644 index 0000000000..6c1e19a835 --- /dev/null +++ b/src/turbomind/utils/parser.cc @@ -0,0 +1,39 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
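The parser helpers added below split `key=value` argument lists and bracketed lists/tuples with plain regexes. A usage sketch — the return types (string pairs and string tokens) are inferred from the implementation:

```cpp
#include "src/turbomind/utils/parser.h"

#include <iostream>
#include <string>
#include <vector>

int main()
{
    // key=value items; a value may be a scalar, a [list] or a (tuple)
    for (const auto& kv : turbomind::ParseArgsList("alpha=0.5,dims=[128,256]")) {
        std::cout << kv.first << " -> " << kv.second << "\n";  // alpha -> 0.5, dims -> [128,256]
    }

    std::vector<int> dims;
    turbomind::Parse(dims, "[128,256]");  // tokenized by ParseListOrTuple, then std::stoi
    std::cout << dims.size() << "\n";     // 2
}
```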
+ +#include +#include +#include +#include + +namespace turbomind { + +std::vector> ParseArgsList(const std::string& str) +{ + const std::regex regex(R"((\w+)=([^,\[\(]+|\[.*\]|\(.*\)))"); + + std::sregex_iterator beg(str.begin(), str.end(), regex); + std::sregex_iterator end{}; + + std::vector> ret; + for (auto it = beg; it != end; ++it) { + std::smatch match = *it; + ret.emplace_back(match[1], match[2]); + } + + return ret; +} + +std::vector ParseListOrTuple(const std::string& str) +{ + const std::regex regex(R"([,\[\]\(\)]+)"); + + std::vector ret; + std::copy_if(std::sregex_token_iterator(str.begin(), str.end(), regex, -1), + std::sregex_token_iterator{}, + std::back_inserter(ret), + [](const std::string& s) { return !s.empty(); }); + + return ret; +} + +} // namespace turbomind diff --git a/src/turbomind/utils/parser.h b/src/turbomind/utils/parser.h new file mode 100644 index 0000000000..d72b14dad7 --- /dev/null +++ b/src/turbomind/utils/parser.h @@ -0,0 +1,30 @@ +#include +#include + +namespace turbomind { + +std::vector> ParseArgsList(const std::string& str); + +std::vector ParseListOrTuple(const std::string& str); + +inline void Parse(int& value, const std::string& str) +{ + value = std::stoi(str); +} + +inline void Parse(float& value, const std::string& str) +{ + value = std::stof(str); +} + +template +void Parse(std::vector& xs, const std::string& str) +{ + const auto ss = ParseListOrTuple(str); + for (const auto& s : ss) { + xs.emplace_back(); + Parse(xs.back(), s); + } +} + +} // namespace turbomind diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py index fabc0ff6e0..95b3e691a6 100644 --- a/tests/test_lmdeploy/test_turbomind/test_converter.py +++ b/tests/test_lmdeploy/test_turbomind/test_converter.py @@ -7,42 +7,41 @@ def test_registered_models(): for model, model_format, group_size, weight_type, register_name in [ - ('internlm/internlm2-7b', 'hf', 0, 'bf16', 'bf16'), - ('baichuan-inc/Baichuan-7B', 'hf', 0, 'fp16', 'fp16'), - ('baichuan-inc/Baichuan2-7B-Chat', 'hf', 0, 'bf16', 'bf16'), - ('baichuan-inc/Baichuan-13B-Chat', 'hf', 0, 'bf16', 'bf16'), - ('baichuan-inc/Baichuan2-13B-Chat', 'hf', 0, 'bf16', 'bf16'), - ('internlm/internlm-chat-7b', 'hf', 0, 'fp16', 'fp16'), - ('internlm/internlm2-chat-7b', 'hf', 0, 'bf16', 'bf16'), - ('internlm/internlm-xcomposer2-4khd-7b', 'hf', 0, 'bf16', 'plora'), - ('internlm/internlm-xcomposer2-vl-7b', 'hf', 0, 'bf16', 'plora'), - ('internlm/internlm-xcomposer2-7b', 'hf', 0, 'bf16', 'plora'), - ('lmsys/vicuna-7b-v1.5', 'hf', 0, 'fp16', 'fp16'), - ('01-ai/Yi-1.5-9B', 'hf', 0, 'bf16', 'bf16'), - ('deepseek-ai/deepseek-coder-6.7b-instruct', 'hf', 0, 'bf16', 'bf16'), - ('deepseek-ai/deepseek-llm-7b-chat', 'hf', 0, 'bf16', 'bf16'), - ('Qwen/Qwen-7B-Chat', 'hf', 0, 'bf16', 'bf16'), - ('Qwen/Qwen1.5-7B-Chat', 'hf', 0, 'bf16', 'bf16'), - ('Qwen/Qwen2-7B-Instruct', 'hf', 0, 'bf16', 'bf16'), - ('Qwen/Qwen-VL-Chat', 'hf', 0, 'bf16', 'bf16'), - ('liuhaotian/llava-v1.6-34b', 'hf', 0, 'bf16', 'bf16'), - ('liuhaotian/llava-v1.6-mistral-7b', 'hf', 0, 'bf16', 'bf16'), - ('liuhaotian/llava-v1.6-vicuna-13b', 'hf', 0, 'bf16', 'bf16'), - ('OpenGVLab/InternVL-Chat-V1-5', 'hf', 0, 'bf16', 'bf16'), - ('deepseek-ai/deepseek-vl-7b-chat', 'hf', 0, 'fp16', 'fp16'), - ('YanweiLi/MGM-7B', 'hf', 0, 'bf16', 'bf16'), - ('Qwen/Qwen1.5-4B-Chat-AWQ', 'awq', 128, 'int4', 'w4'), + ('internlm/internlm2-7b', 'hf', 0, 'bf16', 'tm'), + ('baichuan-inc/Baichuan-7B', 'hf', 0, 'fp16', 'tm'), + 
('baichuan-inc/Baichuan2-7B-Chat', 'hf', 0, 'bf16', 'tm'), + ('baichuan-inc/Baichuan-13B-Chat', 'hf', 0, 'bf16', 'tm'), + ('baichuan-inc/Baichuan2-13B-Chat', 'hf', 0, 'bf16', 'tm'), + ('internlm/internlm-chat-7b', 'hf', 0, 'fp16', 'tm'), + ('internlm/internlm2-chat-7b', 'hf', 0, 'bf16', 'tm'), + ('internlm/internlm-xcomposer2-4khd-7b', 'hf', 0, 'bf16', 'tm'), + ('internlm/internlm-xcomposer2-vl-7b', 'hf', 0, 'bf16', 'tm'), + ('internlm/internlm-xcomposer2-7b', 'hf', 0, 'bf16', 'tm'), + ('lmsys/vicuna-7b-v1.5', 'hf', 0, 'fp16', 'tm'), + ('01-ai/Yi-1.5-9B', 'hf', 0, 'bf16', 'tm'), + ('deepseek-ai/deepseek-coder-6.7b-instruct', 'hf', 0, 'bf16', 'tm'), + ('deepseek-ai/deepseek-llm-7b-chat', 'hf', 0, 'bf16', 'tm'), + ('Qwen/Qwen-7B-Chat', 'hf', 0, 'bf16', 'tm'), + ('Qwen/Qwen1.5-7B-Chat', 'hf', 0, 'bf16', 'tm'), + ('Qwen/Qwen2-7B-Instruct', 'hf', 0, 'bf16', 'tm'), + ('Qwen/Qwen-VL-Chat', 'hf', 0, 'bf16', 'tm'), + ('liuhaotian/llava-v1.6-34b', 'hf', 0, 'bf16', 'tm'), + ('liuhaotian/llava-v1.6-mistral-7b', 'hf', 0, 'bf16', 'tm'), + ('liuhaotian/llava-v1.6-vicuna-13b', 'hf', 0, 'bf16', 'tm'), + ('OpenGVLab/InternVL-Chat-V1-5', 'hf', 0, 'bf16', 'tm'), + ('deepseek-ai/deepseek-vl-7b-chat', 'hf', 0, 'fp16', 'tm'), + ('YanweiLi/MGM-7B', 'hf', 0, 'bf16', 'tm'), + ('Qwen/Qwen1.5-4B-Chat-AWQ', 'awq', 128, 'int4', 'tm'), ('solidrust/Meta-Llama-3-8B-Instruct-hf-AWQ', 'awq', 128, - 'int4', 'w4'), - ('internlm/internlm2-chat-20b-4bits', 'awq', 128, 'int4', 'w4'), - ('internlm/internlm-xcomposer2-vl-7b-4bit', 'awq', 128, 'int4', - 'plora-w4') + 'int4', 'tm'), + ('internlm/internlm2-chat-20b-4bits', 'awq', 128, 'int4', 'tm'), + ('internlm/internlm-xcomposer2-vl-7b-4bit', 'awq', 128, 'int4', 'tm') ]: input_name = get_input_model_registered_name(model, model_format=model_format) assert input_name in list(INPUT_MODELS.module_dict.keys()) - output_name, config = get_output_model_registered_name_and_config( + output_name, config, _ = get_output_model_registered_name_and_config( model, model_format=model_format, group_size=0) assert output_name == register_name assert config.group_size == group_size @@ -53,7 +52,7 @@ def test_registered_models(): def test_update_from_engine_config(): import copy - _, _config = get_output_model_registered_name_and_config( + _, _config, _ = get_output_model_registered_name_and_config( 'internlm/internlm2-chat-7b', model_format='hf', group_size=0) config = copy.deepcopy(_config) config.update_from_engine_config(None) From 6522a8746ed69896a0a50beba9984a5ccdc85330 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:26:28 +0800 Subject: [PATCH 22/39] Update error status_code to raise error in openai client (#2333) * Update error status_code to raise error in openai client * remove strict --- lmdeploy/serve/openai/api_server.py | 15 +++++++++------ lmdeploy/serve/openai/protocol.py | 4 +++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 1b64f877cc..040f03ec2a 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -96,24 +96,27 @@ def available_models(): return ModelList(data=model_cards) -def create_error_response(status: HTTPStatus, message: str): +def create_error_response(status: HTTPStatus, + message: str, + error_type='invalid_request_error'): """Create error response according to http status and message. 
Args: status (HTTPStatus): HTTP status codes and reason phrases message (str): error message + error_type (str): error type """ - return JSONResponse( - ErrorResponse(message=message, - type='invalid_request_error', - code=status.value).model_dump()) + return JSONResponse(ErrorResponse(message=message, + type=error_type, + code=status.value).model_dump(), + status_code=status.value) async def check_request(request) -> Optional[JSONResponse]: """Check if a request is valid.""" if hasattr(request, 'model') and request.model not in get_model_list(): return create_error_response( - HTTPStatus.BAD_REQUEST, + HTTPStatus.NOT_FOUND, f'The model `{request.model}` does not exist.') if hasattr(request, 'n') and request.n <= 0: return create_error_response( diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index b1f83139e7..57eecc45f3 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -10,9 +10,11 @@ class ErrorResponse(BaseModel): """Error responses.""" - object: str = 'error' message: str + type: str code: int + param: Optional[str] = None + object: str = 'error' class ModelPermission(BaseModel): From 12112e165833d743023ed9f1c53a144f3b7110fb Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Tue, 20 Aug 2024 14:36:03 +0800 Subject: [PATCH 23/39] Change to use device instead of device-type in cli (#2337) * remove device-type in cli * remove device arg from lite cli --- lmdeploy/cli/cli.py | 6 +++--- lmdeploy/cli/lite.py | 3 --- lmdeploy/cli/serve.py | 8 ++++---- lmdeploy/cli/utils.py | 19 ++++++------------- 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index f33c276d5d..59fb2f3453 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -111,7 +111,7 @@ def add_parser_chat(): # pytorch engine args pt_group = parser.add_argument_group('PyTorch engine arguments') ArgumentHelper.adapters(pt_group) - ArgumentHelper.device_type(pt_group) + ArgumentHelper.device(pt_group) # common engine args tp_act = ArgumentHelper.tp(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) @@ -250,7 +250,7 @@ def chat(args): cache_max_entry_count=args.cache_max_entry_count, adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, - device_type=args.device_type) + device_type=args.device) run_chat(args.model_path, engine_config, chat_template_config=chat_template_config) @@ -259,7 +259,7 @@ def chat(args): kwargs = convert_args(args) kwargs.pop('chat_template') kwargs.pop('backend') - kwargs.pop('device_type') + kwargs.pop('device') kwargs['chat_template_config'] = chat_template_config run_chat(**kwargs) diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index 56b0ebe07c..9aa6000505 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -35,7 +35,6 @@ def add_parser_auto_awq(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) - ArgumentHelper.device(parser) parser.add_argument('--w-bits', type=int, default=4, @@ -67,7 +66,6 @@ def add_parser_calibrate(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) - ArgumentHelper.device(parser) @staticmethod def add_parser_smooth_quant(): @@ -91,7 +89,6 @@ def add_parser_smooth_quant(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) - ArgumentHelper.device(parser) @staticmethod def auto_awq(args): diff --git 
a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index d615b815b9..c797802127 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -58,7 +58,7 @@ def add_parser_gradio(): # common engine args tp_act = ArgumentHelper.tp(pt_group) - ArgumentHelper.device_type(pt_group) + ArgumentHelper.device(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) max_batch_size_act = ArgumentHelper.max_batch_size(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) @@ -144,7 +144,7 @@ def add_parser_api_server(): pt_group = parser.add_argument_group('PyTorch engine arguments') ArgumentHelper.adapters(pt_group) - ArgumentHelper.device_type(pt_group) + ArgumentHelper.device(pt_group) # common engine args tp_act = ArgumentHelper.tp(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) @@ -211,7 +211,7 @@ def gradio(args): block_size=args.cache_block_seq_len, session_len=args.session_len, enable_prefix_caching=args.enable_prefix_caching, - device_type=args.device_type) + device_type=args.device) else: backend_config = TurbomindEngineConfig( tp=args.tp, @@ -254,7 +254,7 @@ def api_server(args): session_len=args.session_len, adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, - device_type=args.device_type) + device_type=args.device) else: from lmdeploy.messages import TurbomindEngineConfig backend_config = TurbomindEngineConfig( diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 37a3ef7f0b..b44298a6d9 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -349,14 +349,16 @@ def calib_search_scale(parser): ) @staticmethod - def device(parser): + def device(parser, + default: str = 'cuda', + choices: List[str] = ['cuda', 'ascend']): """Add argument device to parser.""" return parser.add_argument('--device', type=str, - default='cuda', - choices=['cuda', 'cpu'], - help='Device type of running') + default=default, + choices=choices, + help='The device type of running') @staticmethod def chat_template(parser): @@ -452,12 +454,3 @@ def vision_max_batch_size(parser): type=int, default=1, help='the vision model batch size') - - @staticmethod - def device_type(parser, default: str = 'cuda'): - return parser.add_argument( - '--device-type', - type=str, - default=default, - choices=['cuda', 'ascend'], - help='The inference device type for pytorch engine.') From 8ed696c3552c033e0a35bab8b60cb98fcd60e62d Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 20 Aug 2024 16:17:05 +0800 Subject: [PATCH 24/39] Add GEMM test utils (#2342) * add test utils * lint * fix msvc build * fix msvc build * lint --- src/turbomind/kernels/gemm/gemm.cu | 14 ++++- src/turbomind/kernels/gemm/test/gemm_bench.cu | 14 +---- src/turbomind/kernels/gemm/test/gemm_test.cu | 36 ++++++++--- src/turbomind/kernels/gemm/test/models.h | 19 ++++++ .../kernels/gemm/test/quantization_impl.h | 2 +- src/turbomind/kernels/gemm/test/test_utils.cu | 57 +++++++++++++++++- src/turbomind/kernels/gemm/test/test_utils.h | 5 ++ src/turbomind/kernels/gemm/test/testbed.h | 59 +++++++++++++++++-- src/turbomind/kernels/gemm/types.h | 1 + src/turbomind/models/llama/LlamaLinear.cu | 3 +- 10 files changed, 179 insertions(+), 31 deletions(-) create mode 100644 src/turbomind/kernels/gemm/test/models.h diff --git a/src/turbomind/kernels/gemm/gemm.cu b/src/turbomind/kernels/gemm/gemm.cu index c6d3739a1e..59cd148182 100644 --- a/src/turbomind/kernels/gemm/gemm.cu +++ b/src/turbomind/kernels/gemm/gemm.cu @@ -341,7 +341,7 @@ int Gemm::Run(const Operation& operation, k, }; - const auto 
launch = [&](LaunchSpec spec, cudaStream_t st) { + const auto launch = [=](LaunchSpec spec, cudaStream_t st) { auto _workspace = workspace; return spec.kernel->Launch(operation, alpha, @@ -364,6 +364,18 @@ int Gemm::Run(const Operation& operation, st); }; + if (operation.reserved) { + auto specs = impl_->Find(desc, workspace.barriers_size, workspace.partials_size, 0); + auto cases = (std::vector>*)operation.reserved; + for (const auto& spec : specs) { + cases->push_back([=] { + launch(spec, stream); + return spec; + }); + } + return -1; + } + LaunchSpec spec{}; if (operation.dispatch & DispatchPolicy::kMeasure) { diff --git a/src/turbomind/kernels/gemm/test/gemm_bench.cu b/src/turbomind/kernels/gemm/test/gemm_bench.cu index 3295d2e1a6..216d6d9d83 100644 --- a/src/turbomind/kernels/gemm/test/gemm_bench.cu +++ b/src/turbomind/kernels/gemm/test/gemm_bench.cu @@ -2,24 +2,12 @@ #include "nvbench/main.cuh" #include "src/turbomind/kernels/gemm/operand.h" +#include "src/turbomind/kernels/gemm/test/models.h" #include "src/turbomind/kernels/gemm/test/testbed.h" #include #include #include -std::vector> config{ - {11008 * 2, 4096}, {4096, 11008}, {12288, 4096}, {4096, 4096}, // llama2-7b - {14336 * 2, 4096}, {4096, 14336}, {6144, 4096}, {4096, 4096}, // llama3-8b / internlm2.5-7b - {16384 * 2, 6144}, {6144, 16384}, {8192, 6144}, {6144, 6144}, // internlm2-20b - {13696 * 2, 4096}, {4096, 13696}, {4608, 4096}, {4096, 4096}, // glm4-9b - {18944 * 2, 3584}, {3584, 18944}, {4608, 3584}, {3584, 3584}, // qwen2-7b - {20480 * 2, 7168}, {7168, 20480}, {9216, 7168}, {7168, 7168}, // yi-34b - {28672 * 2, 8192}, {8192, 28672}, {10240, 8192}, {8192, 8192}, // llama2-70b / llama3-70b - {29696 * 2, 8192}, {8192, 29696}, {10240, 8192}, {8192, 8192} // qwen2-72b-instruct-awq -}; - -// {29568 * 2, 8192}, {8192, 29568}, {10240, 8192}, {8192, 8192}, // qwen2-72b - void gemm_bench(nvbench::state& state) { const auto idx = state.get_int64("idx"); diff --git a/src/turbomind/kernels/gemm/test/gemm_test.cu b/src/turbomind/kernels/gemm/test/gemm_test.cu index 224d61f193..f783e7d284 100644 --- a/src/turbomind/kernels/gemm/test/gemm_test.cu +++ b/src/turbomind/kernels/gemm/test/gemm_test.cu @@ -1,3 +1,4 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
#include "src/turbomind/kernels/attention/quantization.h" @@ -5,6 +6,7 @@ #include "src/turbomind/kernels/gemm/gemm.h" #include "src/turbomind/kernels/gemm/gpu_metric.h" #include "src/turbomind/kernels/gemm/kernel.h" +#include "src/turbomind/kernels/gemm/test/models.h" #include "src/turbomind/kernels/gemm/test/quantization.h" #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/kernels/gemm/test/testbed.h" @@ -13,6 +15,8 @@ #include #include +#include +#include #include using namespace turbomind; @@ -34,6 +38,8 @@ void ComputeRefCpu(half* C, const half* A, const half* B, int m, int n, int k) } } +static int g_check = 0; + void Run(int batch_size, int output_dims, int input_dims, int g = 128) { auto& test = get_test(); @@ -46,20 +52,36 @@ void Run(int batch_size, int output_dims, int input_dims, int g = 128) std::cerr << "m" << m << "n" << n << "k" << k << "\n"; test.Initialize(m, n, k, g, 0); - for (int i = 0; i < 10; ++i) { - test.Run(); + if (g_check) { + test.Check(); + } + else { + for (int i = 0; i < 10; ++i) { + test.Run(); + } + test.CompareC(); } - - // test.CompareB(); - test.CompareC(); - - return; } int main(int argc, char* argv[]) { + g_check = 0; Run(16384, 16384, 16384); + // g_check = 1; + // std::vector bsz(1024); + // { + // std::iota(bsz.begin(), bsz.end(), 1); + // std::random_device rd; + // std::mt19937 g(rd()); + // std::shuffle(bsz.begin() + 1, bsz.end(), g); + // } + // for (const auto& b : bsz) { + // for (const auto& [out, in] : config) { + // Run(b, out, in); + // } + // } + if (auto ec = cudaDeviceSynchronize(); ec != cudaSuccess) { std::cerr << "un-clean exit: " << cudaGetErrorString(ec) << "\n"; } diff --git a/src/turbomind/kernels/gemm/test/models.h b/src/turbomind/kernels/gemm/test/models.h new file mode 100644 index 0000000000..8a6260fef8 --- /dev/null +++ b/src/turbomind/kernels/gemm/test/models.h @@ -0,0 +1,19 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +static const std::vector> config{ + {11008 * 2, 4096}, {4096, 11008}, {12288, 4096}, {4096, 4096}, // llama2-7b + {14336 * 2, 4096}, {4096, 14336}, {6144, 4096}, {4096, 4096}, // llama3-8b / internlm2.5-7b + {16384 * 2, 6144}, {6144, 16384}, {8192, 6144}, {6144, 6144}, // internlm2-20b + {13696 * 2, 4096}, {4096, 13696}, {4608, 4096}, {4096, 4096}, // glm4-9b + {18944 * 2, 3584}, {3584, 18944}, {4608, 3584}, {3584, 3584}, // qwen2-7b + {20480 * 2, 7168}, {7168, 20480}, {9216, 7168}, {7168, 7168}, // yi-34b + {28672 * 2, 8192}, {8192, 28672}, {10240, 8192}, {8192, 8192}, // llama2-70b / llama3-70b + {29696 * 2, 8192}, {8192, 29696}, {10240, 8192}, {8192, 8192} // qwen2-72b-instruct-awq +}; +// {29568 * 2, 8192}, {8192, 29568}, {10240, 8192}, {8192, 8192}, // qwen2-72b diff --git a/src/turbomind/kernels/gemm/test/quantization_impl.h b/src/turbomind/kernels/gemm/test/quantization_impl.h index 992e5a3c3b..27dfcac5a1 100644 --- a/src/turbomind/kernels/gemm/test/quantization_impl.h +++ b/src/turbomind/kernels/gemm/test/quantization_impl.h @@ -193,7 +193,7 @@ void Quantize(const thrust::universal_vector& x, cudaStreamSynchronize(stream); - Compare(_x_p.data().get(), _x.data().get(), k, k, m); + // Compare(_x_p.data().get(), _x.data().get(), k, k, m); const int kg = ceil_div(k, group_size); for (int i = 0; i < m * kg; ++i) { diff --git a/src/turbomind/kernels/gemm/test/test_utils.cu b/src/turbomind/kernels/gemm/test/test_utils.cu index f6e3915fa6..8f2b4007f6 100644 --- a/src/turbomind/kernels/gemm/test/test_utils.cu +++ b/src/turbomind/kernels/gemm/test/test_utils.cu @@ -12,6 +12,11 @@ #include #include +#include +#include +#include +#include + namespace turbomind { cublasHandle_t cublas_handle{}; @@ -33,7 +38,7 @@ void Compare(const T* src, const T* ref, size_t stride, int dims, int bsz, bool // std::cout << x << "\t" << y << std::endl; // } auto abs_diff = std::abs(x - y); - auto rel_diff = abs_diff / std::abs(y + 1e-6f); + auto rel_diff = abs_diff / (std::max(std::abs(y), std::abs(x)) + 1e-8f); if (!(abs_diff <= atol + rtol * std::abs(y))) { ++outliers; if (show) { @@ -46,8 +51,10 @@ void Compare(const T* src, const T* ref, size_t stride, int dims, int bsz, bool asums += abs_diff_sum / dims; rsums += rel_diff_sum / dims; } - std::cout << "abs_diff = " << asums / bsz << " rel_diff = " << rsums / bsz - << " outliers = " << outliers / (float)bsz << std::endl; + const float abs_diff = asums / bsz; + const float rel_diff = rsums / bsz; + const float outlier = outliers / (float)bsz; + std::cout << "abs_diff = " << abs_diff << " rel_diff = " << rel_diff << " outliers = " << outlier << std::endl; } template void @@ -65,6 +72,50 @@ template void Compare(const nv_bfloat16* src, float atol); #endif +template +std::vector +FastCompare(const T* src, const T* ref, int dims, int bsz, cudaStream_t stream, float rtol, float atol) +{ + auto zip_iter = thrust::make_zip_iterator(src, ref); + const auto count = (size_t)dims * bsz; + // nvcc-11.8: __host__ __device__ lambda can't be generic + using Tuple = thrust::tuple; + auto res = thrust::transform_reduce( + thrust::cuda::par.on(stream), + zip_iter, + zip_iter + count, + [=] __device__(auto tup) { + float s = thrust::get<0>(tup); + float r = thrust::get<1>(tup); + float abs_diff = fabsf(s - r); + float abs_s = fabsf(s); + float abs_r = fabsf(r); + float rel_diff = abs_diff / (fmaxf(abs_r, abs_s) + 1e-8f); + int64_t outlier = !(abs_diff <= (atol + rtol * abs_r)); + return thrust::make_tuple(abs_s, abs_r, abs_diff, 
abs_diff, rel_diff, rel_diff, outlier); + }, + thrust::make_tuple(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0LL), + [] __host__ __device__(const Tuple& a, const Tuple& b) { // `__host__`: compiler needs the return type + return thrust::make_tuple(thrust::get<0>(a) + thrust::get<0>(b), + thrust::get<1>(a) + thrust::get<1>(b), + thrust::get<2>(a) + thrust::get<2>(b), + fmaxf(thrust::get<3>(a), thrust::get<3>(b)), + thrust::get<4>(a) + thrust::get<4>(b), + fmaxf(thrust::get<5>(a), thrust::get<5>(b)), + thrust::get<6>(a) + thrust::get<6>(b)); + }); + return {thrust::get<0>(res) / dims / bsz, // avg abs src + thrust::get<1>(res) / dims / bsz, // avg abs ref + thrust::get<2>(res) / dims / bsz, // avg abs diff + thrust::get<3>(res), // max abs diff + thrust::get<4>(res) / dims / bsz, // avg rel diff + thrust::get<5>(res), // max rel diff + (float)thrust::get<6>(res) / bsz}; // outlier count +} + +template std::vector +FastCompare(const half* src, const half* ref, int dims, int bsz, cudaStream_t stream, float rtol, float atol); + void LoadBinary(const std::string& path, size_t size, void* dst) { std::ifstream ifs(path, std::ios::binary | std::ios::in); diff --git a/src/turbomind/kernels/gemm/test/test_utils.h b/src/turbomind/kernels/gemm/test/test_utils.h index 27281a4a47..00401fe7ef 100644 --- a/src/turbomind/kernels/gemm/test/test_utils.h +++ b/src/turbomind/kernels/gemm/test/test_utils.h @@ -5,6 +5,7 @@ #include "src/turbomind/macro.h" #include #include +#include namespace turbomind { @@ -18,6 +19,10 @@ void Compare(const T* src, float rtol = 1e-2, float atol = 1e-4); +template +std::vector +FastCompare(const T* src, const T* ref, int dims, int bsz, cudaStream_t stream, float rtol = 1e-2, float atol = 1e-4); + void LoadBinary(const std::string& path, size_t size, void* dst); class RNG { diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 3ca1239729..6e586a18b0 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -4,12 +4,15 @@ #include "src/turbomind/kernels/core/array.h" #include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/gemm/desc.h" #include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/kernel.h" #include "src/turbomind/kernels/gemm/test/quantization.h" #include "src/turbomind/kernels/gemm/test/reference.h" #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" +#include #include #include #include @@ -207,7 +210,7 @@ class Testbed { } } - void Run() + void Run(void* ctx = {}) { const Operation operation{ dispatch_policy_, @@ -215,6 +218,7 @@ class Testbed { quant_a_, quant_b_, kBatchDim, + ctx, }; const Workspace workspace{barriers_.data().get(), barriers_.size(), partials_.data().get(), partials_.size()}; @@ -237,7 +241,7 @@ class Testbed { workspace, stream_); - if (status) { + if (!ctx && status) { std::cerr << "Run failed, code =" << status << "\n"; std::abort(); } @@ -280,11 +284,54 @@ class Testbed { // Compare(c_.data().get(), c_f_.data().get(), n_, n_, m_, 0); + int dims = m_, bsz = n_; if (order_c == kRowMajor) { - Compare(c_.data().get(), c_ref_.data().get(), n_, n_, m_, 0); + std::swap(dims, bsz); } - else { - Compare(c_.data().get(), c_ref_.data().get(), m_, m_, n_, 0); + Compare(c_.data().get(), c_ref_.data().get(), dims, dims, bsz, 0); + } + + void Check() + { + reference_.gemm(a_f_.data().get(), // + a_desc_, + b_f_.data().get(), + b_desc_, 
+ c_ref_.data().get(), + c_desc_); + + std::vector> cases; + Run(&cases); + + int dims = m_, bsz = n_; + if (order_c == kRowMajor) { + std::swap(dims, bsz); + } + + max_vals_.resize(7); + + auto infnan = [](float x) { return std::isinf(x) || std::isnan(x); }; + auto greater = [](auto& a, auto& b) { + // skip abs(src) & abs(ref) + for (int i = 2; i < (int)b.size(); ++i) { + if (a[i] > b[i]) { + return true; + } + } + return false; + }; + + for (const auto& c : cases) { + const auto spec = c(); + auto diff = FastCompare(c_.data().get(), c_ref_.data().get(), dims, bsz, stream_); + if (greater(diff, max_vals_) || std::any_of(diff.begin(), diff.end(), infnan)) { + std::cout << spec.kernel->name() << " " << spec.splits << " " << spec.swizzle // + << " " << diff[0] << " " << diff[1] << " " << diff[2] << " " << diff[3] // + << " " << diff[4] << " " << diff[5] << " " << diff[6] << "\n"; + for (int i = 0; i < (int)max_vals_.size(); ++i) { + max_vals_[i] = std::max(max_vals_[i], diff[i]); + } + } } } @@ -356,6 +403,8 @@ class Testbed { Reference reference_; DispatchPolicy dispatch_policy_; std::string cache_path_; + + std::vector max_vals_; }; template diff --git a/src/turbomind/kernels/gemm/types.h b/src/turbomind/kernels/gemm/types.h index 6821de0134..133fdafe34 100644 --- a/src/turbomind/kernels/gemm/types.h +++ b/src/turbomind/kernels/gemm/types.h @@ -214,6 +214,7 @@ struct Operation { QuantDesc quant_a; QuantDesc quant_b; int batch_dim; + void* reserved; }; struct MatrixLayout { diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu index 5b9f743174..9f80bdab0c 100644 --- a/src/turbomind/models/llama/LlamaLinear.cu +++ b/src/turbomind/models/llama/LlamaLinear.cu @@ -113,7 +113,8 @@ struct LlamaLinear::Impl { type == kFusedSiluFfn ? Epilogue::kGatedSilu : Epilogue::kNone, {QuantType::kNone}, {QuantType::kDefault, weight.group_size}, - 0}; + 0, + nullptr}; const MatrixLayout a_desc{ get_data_type_v, From 6f810137b8c6398efd78d202fe383922c306c3bc Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 20 Aug 2024 16:18:27 +0800 Subject: [PATCH 25/39] add environment variable to turn off silu fusion (#2343) --- .../models/llama/LlamaDecoderLayerWeight.cc | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 30cc363c41..701d8a9a03 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -33,6 +33,25 @@ namespace turbomind { +static bool is_fuse_silu_act() +{ + static const bool value = [] { + const auto str = std::getenv("TM_FUSE_SILU_ACT"); + if (str) { + try { + auto v = std::stoi(str) != 0; + TM_LOG_INFO("TM_FUSE_SILU_ACT=%d", (int)v); + return v; + } + catch (...) 
{ + } + } + TM_LOG_INFO("TM_FUSE_SILU_ACT=1"); + return true; + }(); + return value; +} + template LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, size_t head_num, @@ -91,9 +110,8 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, } } } - // fused_up_and_gate_ = weight_type_ == WeightType::kINT4 && ffn_weights.gating.lora.policy != LoraPolicy::kPlora; - fused_up_and_gate_ = true && ffn_weights.gating.lora.policy != LoraPolicy::kPlora; + fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; self_attn_weights.qkv.input_dims = hidden_units_; self_attn_weights.qkv.output_dims = (head_num + 2 * kv_head_num) * size_per_head / tensor_para_size_; @@ -119,7 +137,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, ffn_weights.fused_gating_intermediate.output_dims = inter_size_ / tensor_para_size_ * 2; ffn_weights.fused_gating_intermediate.type = weight_type; ffn_weights.fused_gating_intermediate.group_size = group_size; - ffn_weights.is_fused_silu = weight_type == WeightType::kINT4; + ffn_weights.is_fused_silu = weight_type == WeightType::kINT4 && is_fuse_silu_act(); ffn_weights.output.input_dims = inter_size_ / tensor_para_size_; ffn_weights.output.output_dims = hidden_units_; From 8ee8afbcdaf46a07fc4f26196de87d8ca3d8c95e Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 20 Aug 2024 17:43:03 +0800 Subject: [PATCH 26/39] Fix the way to get "quantization_config" from model's coniguration (#2325) * fix getting quantization config * recursive get quant_config * minor fix * check 128 --- lmdeploy/cli/utils.py | 5 +- lmdeploy/turbomind/deploy/converter.py | 78 +++++++++++++++++++++++--- lmdeploy/turbomind/turbomind.py | 23 +------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index b44298a6d9..8dc25deb33 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -107,8 +107,9 @@ def model_format(parser, default: str = None): type=str, default=default, choices=['hf', 'llama', 'awq', 'gptq'], - help='The format of input model. `hf` meaning `hf_llama`, `llama` ' - 'meaning `meta_llama`, `awq` meaning the quantized model by awq') + help='The format of input model. 
`hf` means `hf_llama`, `llama` ' + 'means `meta_llama`, `awq` represents the quantized model by AWQ,' + ' and `gptq` refers to the quantized model by GPTQ') @staticmethod def revision(parser, default: str = None): diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 60c93c9047..e9b67a3b66 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -161,17 +161,78 @@ def pack_model_repository(workspace_path: str): dst=osp.join(model_repo_dir, 'postprocessing')) +def find_quantization_config(nested, target_key): + if isinstance(nested, dict): + for key, value in nested.items(): + if key == target_key: + return value + if isinstance(value, (dict, list)): + result = find_quantization_config(value, target_key) + if result is not None: + return result + elif isinstance(nested, list): + for item in nested: + result = find_quantization_config(item, target_key) + if result is not None: + return result + return None + + def get_tm_model(model_path, model_name, chat_template_name, - group_size, engine_config, + group_size: int = None, out_dir: str = None): - # TODO: open the following condition check in another PR, - # CLI needs to be updated - # if model_format == 'awq' and group_size <= 0: - # raise RuntimeError( - # 'group_size should be specified when the model is awq') + """Create turbomind model. + + Args: + model_path (str): the path of the input model, which is supposed + to be a local path, or huggingface hub repo_id, or modelscope + hub repo_id + model_name (str): user customized model name + chat_template_name (str): the name of the chat template of + the input model + engine_config(TurbomindEngineConfig): user input engine config + group_size(int): refers to the group_size if the input model + is a w4a16(awq or gptq) quantized model + out_dir(str): the output directory where to save to turbomind model. + If it is None, the turbomind model won't be saved + """ + _, cfg = get_model_arch(model_path) + quant_config = find_quantization_config(cfg.to_dict(), + 'quantization_config') + if quant_config: + quant_method = quant_config.get('quant_method') + _group_size = int(quant_config.get('group_size', 0)) + version = quant_config.get('version') + assert engine_config.model_format is None or \ + engine_config.model_format == quant_method, \ + f'mismatched quant method: user input ' \ + f'"{engine_config.model_format}" ' \ + f'vs model quant_config "{quant_method}"' + assert group_size is None or group_size == _group_size, \ + f'mismatched quant group size: user input "{group_size}" ' \ + f'vs model quant_config "{_group_size}"' + + engine_config.model_format = quant_method + group_size = _group_size + + if quant_method == 'awq': + assert version == 'gemm', \ + f'unsupported quant config: {quant_config}' + elif quant_method == 'gptq': + assert not quant_config.get('desc_act', False) and \ + quant_config.get('sym', True), \ + f'unsupported quant config: {quant_config}' + else: + assert 0, f'unsupported quant_config: {quant_config}' + + if engine_config.model_format in ['awq', 'gptq']: + assert group_size == 128, \ + f'model format is "{engine_config.model_format}" ' \ + f'but group_size is {group_size}. 
Currently, only 128 ' \ + 'is supported' input_model_name = get_input_model_registered_name( model_path, engine_config.model_format) @@ -266,10 +327,9 @@ def main(model_name: str, tm_weight_path, tm_tokenizer_path = create_workspace(dst_path) copy_tokenizer(model_path, tokenizer_path, tm_tokenizer_path) - engine_config = TurbomindEngineConfig(tp=tp, model_format=model_format) - tm_model = get_tm_model(model_path, model_name, chat_template, group_size, - engine_config, tm_weight_path) + tm_model = get_tm_model(model_path, model_name, chat_template, + engine_config, group_size, tm_weight_path) tm_model.export() diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 18317eee0d..ec5b064305 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -18,7 +18,6 @@ from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger, get_model -from ..archs import get_model_arch from .deploy.converter import SUPPORTED_FORMATS, get_tm_model from .deploy.target_model.base import TurbomindModelConfig from .supported_models import is_supported @@ -176,33 +175,13 @@ def _from_hf(self, model_source: ModelSource, model_path: str, assert engine_config.model_format in SUPPORTED_FORMATS, \ f'The model format should be in {SUPPORTED_FORMATS}' - group_size = 0 - if engine_config.model_format is None: - _, cfg = get_model_arch(model_path) - quant_config = getattr(cfg, 'quantization_config', None) - if quant_config: - quant_method = quant_config.get('quant_method') - group_size = int(quant_config.get('group_size', 0)) - version = quant_config.get('version') - if quant_method == 'awq' and group_size == 128 and \ - version == 'gemm': - engine_config.model_format = 'awq' - elif all((quant_method == 'gptq', group_size == 128, - not quant_config.get('desc_act', False), - quant_config.get('sym', True))): - engine_config.model_format = 'gptq' - else: - raise AssertionError( - f'unsupported quant config: {quant_config}') - assert is_supported(model_path), ( f'turbomind does not support {model_path}. 
' 'Plz try pytorch engine instead.') # convert transformers model into turbomind model format tm_model = get_tm_model(model_path, self.model_name, - self.chat_template_name, group_size, - engine_config) + self.chat_template_name, engine_config) self.config = tm_model.cfg logger.info(f'model_config:\n\n{self.config.toini()}') From 0e6e8970015a3f17c33db98fda7db2c888a82221 Mon Sep 17 00:00:00 2001 From: CyCle1024 Date: Tue, 20 Aug 2024 17:43:45 +0800 Subject: [PATCH 27/39] fix(ascend): fix import error of pt engine in cli (#2328) --- lmdeploy/pytorch/check_env/__init__.py | 14 ++++++++++++++ lmdeploy/pytorch/engine/model_agent.py | 2 ++ 2 files changed, 16 insertions(+) diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 7cfe82b164..1f14ac92b9 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -22,6 +22,19 @@ def _handle_exception(e: Exception, exit(1) +def check_env_deeplink(device_type: str): + """check Deeplink environment if specific device_type is set.""" + deeplink_device_type_list = [ + 'ascend', + ] + if device_type in deeplink_device_type_list: + logger = get_logger('lmdeploy') + try: + import deeplink_ext # noqa: F401 + except Exception as e: + _handle_exception(e, 'PyTorch', logger) + + def check_env_torch(): """check PyTorch environment.""" logger = get_logger('lmdeploy') @@ -78,6 +91,7 @@ def check_env(device_type: str): """check all environment.""" logger = get_logger('lmdeploy') logger.info('Checking environment for PyTorch Engine.') + check_env_deeplink(device_type) check_env_torch() if device_type == 'cuda': check_env_triton() diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py index f1b4f22a2b..dbaf19b6f1 100644 --- a/lmdeploy/pytorch/engine/model_agent.py +++ b/lmdeploy/pytorch/engine/model_agent.py @@ -1003,6 +1003,8 @@ def _start_tp_process(proc_id: int, """ rank = proc_id + 1 try: + from lmdeploy.pytorch.check_env import check_env_deeplink + check_env_deeplink(device_context.device_type) dist.init_process_group('nccl', rank=rank, world_size=world_size, From 0a1f65a6727c48d564395650b84446a8ce099402 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Tue, 20 Aug 2024 18:13:29 +0800 Subject: [PATCH 28/39] Use single thread per model instance (#2339) --- lmdeploy/turbomind/turbomind.py | 57 ++++++----------- src/turbomind/models/llama/LlamaBatch.cc | 78 ++++++++++++++--------- src/turbomind/models/llama/LlamaV2.cc | 79 +++++++++--------------- src/turbomind/models/llama/Request.h | 4 +- 4 files changed, 100 insertions(+), 118 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index ec5b064305..1cd4ef47c9 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -318,26 +318,17 @@ def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0): self.nccl_params = tm_model.nccl_params # create model instances - model_insts = [None] * self.gpu_count - with ThreadPoolExecutor(max_workers=self.gpu_count) as executor: - futures = [] - for device_id in range(self.gpu_count): - futures.append( - executor.submit(self._create_model_instance, device_id, - model_insts)) - for future in futures: - future.result() + self.model_inst = self._create_model_instance(0) - self.model_insts = model_insts self.que = Queue() self.executor: ThreadPoolExecutor = None - self.futures = [None] * self.gpu_count + self.future = None - def _create_model_instance(self, device_id, model_insts): + def 
_create_model_instance(self, device_id): rank = self.node_id * self.gpu_count + device_id model_inst = self.tm_model.model_comm.create_model_instance( device_id, rank, self.cuda_stream_id, self.nccl_params) - model_insts[device_id] = model_inst + return model_inst def _forward_callback(self, result, ctx): self.que.put((False, result)) @@ -346,15 +337,12 @@ def _forward_thread(self, inputs): instance_comm = self.tm_model.model_comm.create_instance_comm( self.gpu_count) - def _func(device_id, enque_output): - output = self.model_insts[device_id].forward(inputs, instance_comm) - if enque_output: - self.que.put((True, output)) + def _func(): + output = self.model_inst.forward(inputs, instance_comm) + self.que.put((True, output)) - self.executor = ThreadPoolExecutor(self.gpu_count) - for device_id in range(self.gpu_count): - f = self.executor.submit(_func, device_id, device_id == 0) - self.futures[device_id] = f + self.executor = ThreadPoolExecutor(1) + self.future = self.executor.submit(_func) def _async_forward_callback(self, result, ctx, que: LifoQueue): que.put((False, result)) @@ -363,15 +351,12 @@ def _async_forward_thread(self, inputs, que: LifoQueue): instance_comm = self.tm_model.model_comm.create_instance_comm( self.gpu_count) - def _func(device_id, enque_output): - output = self.model_insts[device_id].forward(inputs, instance_comm) - if enque_output: - que.put((True, output)) + def _func(): + output = self.model_inst.forward(inputs, instance_comm) + que.put((True, output)) - self.executor = ThreadPoolExecutor(self.gpu_count) - for device_id in range(self.gpu_count): - f = self.executor.submit(_func, device_id, device_id == 0) - self.futures[device_id] = f + self.executor = ThreadPoolExecutor(1) + self.future = self.executor.submit(_func) def _get_logprobs(self, logprob_vals: torch.Tensor, @@ -617,7 +602,7 @@ async def async_stream_infer(self, _forward_thread = partial(self._async_forward_thread, que=que) if stream_output and not stop: logger.info(f'Register stream callback for {session_id}') - self.model_insts[0].register_callback(_forward_callback) + self.model_inst.register_callback(_forward_callback) inputs, input_lengths = self.prepare_inputs( session_id=session_id, @@ -691,14 +676,13 @@ async def async_stream_infer(self, yield outputs if finish: - for f in self.futures: - f.result() + self.future.result() self.executor.shutdown() break if stream_output and not stop: logger.info(f'UN-register stream callback for {session_id}') - self.model_insts[0].unregister_callback() + self.model_inst.unregister_callback() def stream_infer(self, session_id, @@ -730,7 +714,7 @@ def stream_infer(self, """ if stream_output and not stop: logger.info(f'Register stream callback for {session_id}') - self.model_insts[0].register_callback(self._forward_callback) + self.model_inst.register_callback(self._forward_callback) inputs, input_lengths = self.prepare_inputs( session_id=session_id, @@ -803,8 +787,7 @@ def stream_infer(self, yield outputs if finish: - for f in self.futures: - f.result() + self.future.result() self.executor.shutdown() while self.que.qsize() > 0: self.que.get() @@ -812,7 +795,7 @@ def stream_infer(self, if stream_output and not stop: logger.info(f'UN-register stream callback for {session_id}') - self.model_insts[0].unregister_callback() + self.model_inst.unregister_callback() def decode(self, input_ids, diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 8aafbe4b7b..987ac9e86f 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc 
+++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -102,9 +102,9 @@ void LlamaBatch::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_r if (r) { int ec = 0; - const int input_length = r->inputs[rank_].getVal("input_lengths", 0); + const int input_length = r->inputs.getVal("input_lengths", 0); const auto get_offset = [&](int token_count) { - return std::max(0, std::min(token_count, r->inputs[rank_].getVal("step", token_count))); + return std::max(0, std::min(token_count, r->inputs.getVal("step", token_count))); }; if (occurrence[r->id] != 1) { @@ -249,7 +249,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) auto& seq = *state.sequences[idx]; - if (int step = r->inputs[rank_].getVal("step", -1); step >= 0) { + if (int step = r->inputs.getVal("step", -1); step >= 0) { if (step <= seq.tokens.size()) { seq.tokens.resize(step); seq.cache_len = std::min(seq.cache_len, step); @@ -261,8 +261,8 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } } - const int input_length = r->inputs[rank_].getVal("input_lengths"); - const int* input_ids = r->inputs[rank_].getPtr("input_ids"); + const int input_length = r->inputs.getVal("input_lengths"); + const int* input_ids = r->inputs.getPtr("input_ids"); { // `output_ids` contains all token ids of the sequences @@ -285,16 +285,16 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } // copy input tokens to prompt for prefix matching - if (input_length && r->start_flag && !r->inputs[rank_].isExist("input_embedding_ranges")) { + if (input_length && r->start_flag && !r->inputs.isExist("input_embedding_ranges")) { // TODO: truncate prompt to enable prefix caching for VLM seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); } // copy input embeddings - if (r->inputs[rank_].isExist("input_embedding_ranges")) { - const auto range_tensor = r->inputs[rank_].at("input_embedding_ranges"); - const auto emb_tensor = r->inputs[rank_].at("input_embeddings"); + if (r->inputs.isExist("input_embedding_ranges")) { + const auto range_tensor = r->inputs.at("input_embedding_ranges"); + const auto emb_tensor = r->inputs.at("input_embeddings"); const int* ranges = range_tensor.getPtr(); auto check_embeddings = [&](int& num_valid_embeddings) { @@ -332,7 +332,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) range_tensor.toString().c_str()); } else { - char* emb_tensor_ptr = emb_tensor.getPtr(); + const char* emb_tensor_ptr = emb_tensor.getPtr(); for (size_t i = 0; i < num_valid_embeddings; i++) { int begin = ranges[i * 2]; int end = ranges[i * 2 + 1]; @@ -344,7 +344,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) } } - const int request_output_len = state.requests[idx]->inputs[rank_].getVal("request_output_len"); + const int request_output_len = state.requests[idx]->inputs.getVal("request_output_len"); state.seq_len_limit[idx] = state.h_context_length[idx] + request_output_len; // `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len // the actual sequence length is seq_limit_len + 1, hence seq_limit_len must truncated to session_len - 1 @@ -386,7 +386,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) if (r->start_flag) { // prepare to initialize random state for new sequence - h_random_seed_[idx] = r->inputs[rank_].getVal("random_seed", 0); + h_random_seed_[idx] = r->inputs.getVal("random_seed", 0); } else { // Recover device states if not a new sequence @@ -1045,8 +1045,8 @@ void 
LlamaBatch::InitializeSampling(const GenerationState& g) // find an exemplar that matches the param name const Tensor* ptr{}; for (int i = 0; i < batch_size; ++i) { - if (state_->requests[i]->inputs[rank_].isExist(name)) { - ptr = &state_->requests[i]->inputs[rank_].at(name); + if (state_->requests[i]->inputs.isExist(name)) { + ptr = &state_->requests[i]->inputs.at(name); break; } } @@ -1061,8 +1061,8 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) int max_list_length = 0; if (name == "bad_words_list" || name == "stop_words_list") { for (int i = 0; i < batch_size; ++i) { - if (state_->requests[i]->inputs[rank_].isExist(name)) { - Tensor& src = state_->requests[i]->inputs[rank_].at(name); + if (state_->requests[i]->inputs.isExist(name)) { + Tensor& src = state_->requests[i]->inputs.at(name); FT_CHECK(src.shape.size() == 3 && src.shape[1] == 2 && src.shape[2] <= kMaxStopBadWordsLen); max_list_length = std::max(max_list_length, (int)src.shape[2]); } @@ -1075,8 +1075,8 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) } for (int i = 0; i < batch_size; ++i) { FT_CHECK(state_->requests[i] != nullptr); - if (state_->requests[i]->inputs[rank_].isExist(name)) { - Tensor& src = state_->requests[i]->inputs[rank_].at(name); + if (state_->requests[i]->inputs.isExist(name)) { + Tensor& src = state_->requests[i]->inputs.at(name); if (name == "bad_words_list" || name == "stop_words_list") { int list_length = src.shape[2]; std::copy_n(src.getPtr(), @@ -1127,7 +1127,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) TensorMap outputs; for (int i = 0; i < batch_size; i++) { - if (state_->requests[i]->inputs[rank_].isExist("logprobs")) { + if (state_->requests[i]->inputs.isExist("logprobs")) { outputs.insert( {"sampled_logprobs", {MEMORY_GPU, TYPE_FP32, {(size_t)batch_size, 1, kMaxLogProb}, sampled_logprobs_}}); outputs.insert( @@ -1153,7 +1153,7 @@ void LlamaBatch::OutputContextLogits(T* cont bool is_return_logits = false; for (int k = 0; k < indices.size(); ++k) { auto& request = state_->requests[indices[k]]; - auto logits = request->outputs[rank_].getPtr("logits", nullptr); + auto logits = request->outputs.getPtr("logits", nullptr); if (logits && sequences[k]->cache_len + lengths[k] <= sequences[k]->tokens.size()) { logits = nullptr; } @@ -1185,6 +1185,11 @@ void LlamaBatch::OutputContextLogits(T* cont auto logits = context_logits_buf_; + // Only rank-0 writes to output + if (rank_ != 0) { + return; + } + for (int k = 0; k < indices.size(); ++k) { if (output_logits[k]) { auto src_ptr = logits; @@ -1250,15 +1255,17 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector ++state_->h_context_length[i]; } - { // output logprobs, should be set before sequence_length + // ! 
Only rank-0 writes to output + if (rank_ == 0) { + // output logprobs, should be set before sequence_length float* sampled_logprobs_ptr = h_sampled_logprobs_; uint32_t* sampled_indexes_ptr = h_sampled_indexes_; uint32_t* sampled_nums_ptr = h_sampled_nums_; for (int i = 0; i < batch_size - g.partial; ++i) { - if (state_->requests[i] && state_->requests[i]->inputs[rank_].isExist("logprobs")) { - auto logprob_vals = state_->requests[i]->outputs[rank_].getPtr("logprob_vals"); - auto logprob_indexes = state_->requests[i]->outputs[rank_].getPtr("logprob_indexes"); - auto logprob_nums = state_->requests[i]->outputs[rank_].getPtr("logprob_nums"); + if (state_->requests[i] && state_->requests[i]->inputs.isExist("logprobs")) { + auto logprob_vals = state_->requests[i]->outputs.getPtr("logprob_vals"); + auto logprob_indexes = state_->requests[i]->outputs.getPtr("logprob_indexes"); + auto logprob_nums = state_->requests[i]->outputs.getPtr("logprob_nums"); int offset = state_->h_context_length[i] - state_->h_prompt_length[i] - 1; std::copy(sampled_logprobs_ptr, @@ -1275,12 +1282,14 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector } } - { // set output tokens ids and sequence length + // ! Only rank-0 writes to output + if (rank_ == 0) { + // set output tokens ids and sequence length int* output_ptr = h_output_ids_; for (int i = 0; i < batch_size - g.partial; ++i) { if (state_->requests[i] && (state_->requests[i]->stream_cb || state_->h_finished[i])) { - auto output_ids = state_->requests[i]->outputs[rank_].getPtr("output_ids"); - auto output_len = state_->requests[i]->outputs[rank_].getPtr("sequence_length"); + auto output_ids = state_->requests[i]->outputs.getPtr("output_ids"); + auto output_len = state_->requests[i]->outputs.getPtr("sequence_length"); const int count = state_->h_context_length[i]; // TODO: sync history output tokens at when receiving the request and copy the last token here std::copy(output_ptr, output_ptr + count, output_ids); @@ -1322,7 +1331,7 @@ auto LlamaBatch::Finish(GenerationState& g) -> std::vector signals.push_back([this, r = state_->requests[i]] { if (rank_ == 0) { try { - r->stream_cb(&r->outputs[rank_].get()); + r->stream_cb(&r->outputs.get()); } catch (const std::bad_function_call& e) { TM_LOG_ERROR("Null stream callback for (%s)", std::to_string(r->id).c_str()); @@ -1379,7 +1388,16 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig // Update token IDs seq.tokens.resize(output_len); - const auto output_ids_data = state_->requests[index]->outputs[rank_].at("output_ids").getPtr(); + const auto output_ids_data = [&] { + if (force_stop) { + // `h_output_ids_` is UNDEFINED at `ProcessStopRequests` + return state_->requests[index]->outputs.at("output_ids").getPtr(); + } + else { + // `h_output_ids_` just updated by `Finish`, but `outputs` is NOT synced atm + return h_output_ids_ + index * (size_t)session_len_; + } + }(); std::copy_n(output_ids_data, output_len, seq.tokens.data()); // Save random state in host memory diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index f9f7922ff6..55559acb05 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -479,79 +479,60 @@ void LlamaV2::forward(std::unordered_map* outputs, const int batch_size = outputs->at("output_ids").shape[0]; const auto rank = tensor_para_.rank_; + FT_CHECK(rank == 0); std::vector> requests(batch_size); - // rank-0 allocates all requests for the batch - if (rank == 0) { - for (int i = 
0; i < batch_size; ++i) { - requests[i] = std::make_shared(); - requests[i]->inputs.resize(tensor_para_.world_size_); - requests[i]->outputs.resize(tensor_para_.world_size_); - } - control.comm->setSharedObject(&requests); - } - - control.comm->barrier(); - - if (rank != 0) { - requests = *(std::vector>*)control.comm->getSharedObject(); + // allocates all requests for the batch + for (int i = 0; i < batch_size; ++i) { + requests[i] = std::make_shared(); } for (int i = 0; i < batch_size; ++i) { auto& r = requests[i]; - r->inputs[rank] = slice(*inputs, i); - r->outputs[rank] = slice(*outputs, i); + r->inputs = slice(*inputs, i); + r->outputs = slice(*outputs, i); if (rank == 0) { - r->id = r->inputs[rank].getVal("CORRID", i); - r->start_flag = r->inputs[rank].getVal("START", 1); - r->end_flag = r->inputs[rank].getVal("END", 1); - r->stop_flag = r->inputs[rank].getVal("STOP", 0); + r->id = r->inputs.getVal("CORRID", i); + r->start_flag = r->inputs.getVal("START", 1); + r->end_flag = r->inputs.getVal("END", 1); + r->stop_flag = r->inputs.getVal("STOP", 0); r->stream_cb = control.callback; } } - control.comm->barrier(); - - // rank-0 now takes the ownership of `requests` - // rank-0 submits the tasks and wait for finish + // Submits the tasks and wait for finish std::vector error_codes; bool has_error = 0; - if (rank == 0) { - TM_LOG_INFO("[forward] Enqueue requests"); - std::vector ids; - for (const auto& r : requests) { - ids.push_back(r->id); - } + TM_LOG_INFO("[forward] Enqueue requests"); - auto futures = shared_state_->request_queue.enqueue(std::move(requests)); + std::vector ids; + for (const auto& r : requests) { + ids.push_back(r->id); + } - FT_CHECK_WITH_INFO(ids.size() == futures.size(), "check failed"); + auto futures = shared_state_->request_queue.enqueue(std::move(requests)); - TM_LOG_INFO("[forward] Wait for requests to complete ..."); + FT_CHECK_WITH_INFO(ids.size() == futures.size(), "check failed"); - for (int i = 0; i < futures.size(); ++i) { - auto ec = futures[i].get(); - error_codes.push_back(ec); - if (ec) { - has_error = true; - } - if (!ec) { - TM_LOG_INFO("[forward] Request completed for %ld", (long)ids[i]); - } - else { - TM_LOG_WARNING("[forward] Request failed for %ld, code %d", (long)ids[i], (int)ec); - } + TM_LOG_INFO("[forward] Wait for requests to complete ..."); + + for (int i = 0; i < futures.size(); ++i) { + auto ec = futures[i].get(); + error_codes.push_back(ec); + if (ec) { + has_error = true; + TM_LOG_WARNING("[forward] Request failed for %ld, code %d", (long)ids[i], (int)ec); + } + else { + TM_LOG_INFO("[forward] Request completed for %ld", (long)ids[i]); } } - // prevents request tensors being freed before the batch completes - control.comm->barrier(); - - if (rank == 0 && has_error) { + if (has_error) { std::stringstream ss; for (int i = 0; i < error_codes.size(); ++i) { ss << (i ? 
"" : " ") << error_codes[i]; diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h index 324162a571..2a715e9c9c 100644 --- a/src/turbomind/models/llama/Request.h +++ b/src/turbomind/models/llama/Request.h @@ -21,8 +21,8 @@ struct Request { bool stop_flag; // per rank inputs/outputs - std::vector inputs; - std::vector outputs; + TensorMap inputs; + TensorMap outputs; using Callback = std::function*)>; Callback stream_cb; From f21b5a4003cf57214c76c4c585263f06015d5655 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Tue, 20 Aug 2024 18:15:18 +0800 Subject: [PATCH 29/39] add cache to speed up docker building (#2344) --- docker/Dockerfile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 677597f85c..8caa2ed966 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,10 +21,6 @@ RUN apt-get update -y && apt-get install -y software-properties-common wget vim ENV PATH=/opt/py3/bin:$PATH -RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools==69.5.1 &&\ - python3 -m pip install --no-cache-dir torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} --index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT} &&\ - python3 -m pip install --no-cache-dir cmake packaging wheel - # install openmpi RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz &&\ tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi &&\ @@ -33,6 +29,10 @@ RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.g ENV PATH=$PATH:/usr/local/openmpi/bin ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib +RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --upgrade pip setuptools==69.5.1 &&\ + python3 -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} --index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT} &&\ + python3 -m pip install cmake packaging wheel + ENV NCCL_LAUNCH_MODE=GROUP # Should be in the lmdeploy root directory when building docker image @@ -40,15 +40,14 @@ COPY . /opt/lmdeploy WORKDIR /opt/lmdeploy -RUN cd /opt/lmdeploy &&\ - python3 -m pip install --no-cache-dir -r requirements.txt &&\ +RUN --mount=type=cache,target=/root/.cache/pip cd /opt/lmdeploy &&\ + python3 -m pip install -r requirements.txt &&\ mkdir -p build && cd build &&\ sh ../generate.sh &&\ ninja -j$(nproc) && ninja install &&\ cd .. &&\ python3 -m pip install -e . 
&&\ - rm -rf build &&\ - rm -rf ~/.cache/* + rm -rf build ENV LD_LIBRARY_PATH=/opt/lmdeploy/install/lib:$LD_LIBRARY_PATH ENV PATH=/opt/lmdeploy/install/bin:$PATH From adbefe9a4fb66bab2faa00eb6c04f2e3d480eea0 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 20 Aug 2024 18:16:25 +0800 Subject: [PATCH 30/39] add max_prefill_token_num argument in CLI (#2345) --- lmdeploy/cli/serve.py | 18 ++++++++++++------ lmdeploy/cli/utils.py | 8 ++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index c797802127..1162be6c58 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -64,7 +64,8 @@ def add_parser_gradio(): cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) - + max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num( + pt_group) # turbomind args tb_group = parser.add_argument_group('TurboMind engine arguments') # common engine args @@ -74,6 +75,7 @@ def add_parser_gradio(): tb_group._group_actions.append(cache_max_entry_act) tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) + tb_group._group_actions.append(max_prefill_token_num_act) ArgumentHelper.model_format(tb_group) ArgumentHelper.quant_policy(tb_group) ArgumentHelper.rope_scaling_factor(tb_group) @@ -152,7 +154,8 @@ def add_parser_api_server(): cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) - + max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num( + pt_group) # turbomind args tb_group = parser.add_argument_group('TurboMind engine arguments') # common engine args @@ -162,6 +165,7 @@ def add_parser_api_server(): tb_group._group_actions.append(cache_max_entry_act) tb_group._group_actions.append(cache_block_seq_len_act) tb_group._group_actions.append(prefix_caching_act) + tb_group._group_actions.append(max_prefill_token_num_act) ArgumentHelper.model_format(tb_group) ArgumentHelper.quant_policy(tb_group) ArgumentHelper.rope_scaling_factor(tb_group) @@ -211,7 +215,8 @@ def gradio(args): block_size=args.cache_block_seq_len, session_len=args.session_len, enable_prefix_caching=args.enable_prefix_caching, - device_type=args.device) + device_type=args.device, + max_prefill_token_num=args.max_prefill_token_num) else: backend_config = TurbomindEngineConfig( tp=args.tp, @@ -223,7 +228,7 @@ def gradio(args): cache_max_entry_count=args.cache_max_entry_count, cache_block_seq_len=args.cache_block_seq_len, enable_prefix_caching=args.enable_prefix_caching, - ) + max_prefill_token_num=args.max_prefill_token_num) chat_template_config = get_chat_template(args.chat_template) run(args.model_path_or_server, server_name=args.server_name, @@ -254,7 +259,8 @@ def api_server(args): session_len=args.session_len, adapters=adapters, enable_prefix_caching=args.enable_prefix_caching, - device_type=args.device) + device_type=args.device, + max_prefill_token_num=args.max_prefill_token_num) else: from lmdeploy.messages import TurbomindEngineConfig backend_config = TurbomindEngineConfig( @@ -267,7 +273,7 @@ def api_server(args): cache_max_entry_count=args.cache_max_entry_count, cache_block_seq_len=args.cache_block_seq_len, enable_prefix_caching=args.enable_prefix_caching, - ) + 
max_prefill_token_num=args.max_prefill_token_num) chat_template_config = get_chat_template(args.chat_template) from lmdeploy.messages import VisionConfig diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 8dc25deb33..e1f5c41ab2 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -449,6 +449,14 @@ def max_prefill_iters(parser): default=1, help='the max number of forward passes in prefill stage') + @staticmethod + def max_prefill_token_num(parser): + return parser.add_argument( + '--max-prefill-token-num', + type=int, + default=8192, + help='the max number of tokens per iteration during prefill') + @staticmethod def vision_max_batch_size(parser): return parser.add_argument('--vision-max-batch-size', From 0f5712823d32087acea1c190cd576e1db7f1a545 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:46:48 +0800 Subject: [PATCH 31/39] Fix hidden size and support mistral nemo (#2215) * Fix hidden size and support mistral nemo * fix lint * comments --- lmdeploy/turbomind/deploy/source_model/deepseek_vl.py | 4 +++- lmdeploy/turbomind/deploy/source_model/glm4.py | 4 +++- lmdeploy/turbomind/deploy/source_model/internvl.py | 4 +++- lmdeploy/turbomind/deploy/source_model/llama.py | 4 +++- lmdeploy/turbomind/deploy/source_model/qwen.py | 4 ++++ lmdeploy/turbomind/deploy/target_model/base.py | 6 +++--- src/turbomind/models/llama/LlamaDecoderLayerWeight.cc | 5 +++-- src/turbomind/models/llama/LlamaDecoderLayerWeight.h | 1 + src/turbomind/models/llama/LlamaFfnLayer.h | 3 ++- src/turbomind/models/llama/LlamaV2.cc | 4 +++- src/turbomind/models/llama/LlamaV2.h | 1 + src/turbomind/models/llama/LlamaWeight.cc | 4 +++- src/turbomind/models/llama/LlamaWeight.h | 1 + src/turbomind/models/llama/unified_attention_layer.h | 3 ++- src/turbomind/models/llama/unified_decoder.cc | 2 ++ src/turbomind/models/llama/unified_decoder.h | 3 ++- src/turbomind/triton_backend/llama/LlamaTritonModel.cc | 3 +++ src/turbomind/triton_backend/llama/LlamaTritonModel.h | 1 + 18 files changed, 43 insertions(+), 14 deletions(-) diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py index f17a3398b3..2b60454767 100644 --- a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py +++ b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py @@ -47,6 +47,7 @@ def model_info(self): 'language_config'].get('model_type', None) == 'llama': model_arg = model_arg['language_config'] # depseek-vl num_layer = model_arg['num_hidden_layers'] + hidden_units = model_arg['hidden_size'] norm_eps = model_arg.get('rms_norm_eps', 1e-06) attn_head_num = model_arg.get('num_attention_heads', 32) if 'num_key_value_heads' in model_arg: @@ -67,8 +68,9 @@ def model_info(self): return dict(num_layer=num_layer, norm_eps=norm_eps, - attn_head_num=attn_head_num, + head_num=attn_head_num, kv_head_num=kv_head_num, + hidden_units=hidden_units, rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, use_dynamic_ntk=use_dynamic_ntk, diff --git a/lmdeploy/turbomind/deploy/source_model/glm4.py b/lmdeploy/turbomind/deploy/source_model/glm4.py index 2c69d5d0da..1c26e0649a 100644 --- a/lmdeploy/turbomind/deploy/source_model/glm4.py +++ b/lmdeploy/turbomind/deploy/source_model/glm4.py @@ -85,6 +85,7 @@ def tokenizer_info(self): def model_info(self): """Read model info.""" config = self.config + hidden_units = config.get('hidden_size', None) num_layer = config.get('num_hidden_layers', None) num_layer = 
config.get('num_layers', num_layer) norm_eps = config['layernorm_epsilon'] @@ -98,8 +99,9 @@ def model_info(self): seq_length = config['seq_length'] return dict(num_layer=num_layer, norm_eps=norm_eps, - attn_head_num=attn_head_num, + head_num=attn_head_num, kv_head_num=kv_head_num, + hidden_units=hidden_units, rope_theta=rope_theta, max_position_embeddings=seq_length, rotary_embedding=64, diff --git a/lmdeploy/turbomind/deploy/source_model/internvl.py b/lmdeploy/turbomind/deploy/source_model/internvl.py index d7f446da93..83161adb15 100644 --- a/lmdeploy/turbomind/deploy/source_model/internvl.py +++ b/lmdeploy/turbomind/deploy/source_model/internvl.py @@ -61,6 +61,7 @@ def model_info(self): model_arg = json.load(f)['llm_config'] num_layer = model_arg['num_hidden_layers'] norm_eps = model_arg['rms_norm_eps'] + hidden_units = model_arg['hidden_size'] attn_head_num = model_arg['num_attention_heads'] if 'num_key_value_heads' in model_arg: kv_head_num = model_arg['num_key_value_heads'] @@ -80,7 +81,8 @@ def model_info(self): return dict(num_layer=num_layer, norm_eps=norm_eps, - attn_head_num=attn_head_num, + hidden_units=hidden_units, + head_num=attn_head_num, kv_head_num=kv_head_num, rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py index fb94854a45..a67e3ee4e4 100644 --- a/lmdeploy/turbomind/deploy/source_model/llama.py +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -207,6 +207,7 @@ def model_info(self): kv_head_num = model_arg['num_key_value_heads'] else: kv_head_num = model_arg['num_attention_heads'] + hidden_units = model_arg['hidden_size'] rope_theta = float(model_arg.get('rope_theta', 10000.0)) max_position_embeddings = int( model_arg.get('max_position_embeddings', 0)) @@ -239,8 +240,9 @@ def model_info(self): return dict( num_layer=num_layer, norm_eps=norm_eps, - attn_head_num=attn_head_num, + head_num=attn_head_num, kv_head_num=kv_head_num, + hidden_units=hidden_units, rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, original_max_position_embeddings=original_max_position_embeddings, diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py index 311f8e0a85..4e87057e62 100644 --- a/lmdeploy/turbomind/deploy/source_model/qwen.py +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -65,6 +65,7 @@ def model_info(self): params_path = osp.join(self.model_path, 'config.json') with open(params_path) as f: config = json.load(f) + hidden_units = config['hidden_size'] num_layer = config['num_hidden_layers'] norm_eps = config['layer_norm_epsilon'] rope_theta = float(config.get('rotary_emb_base', 10000.0)) @@ -72,11 +73,14 @@ def model_info(self): kv_head_num = config['num_key_value_heads'] else: kv_head_num = config['num_attention_heads'] + attn_head_num = config['num_attention_heads'] seq_length = config['seq_length'] use_dynamic_ntk = int(config['use_dynamic_ntk']) use_logn_attn = int(config['use_logn_attn']) return dict(num_layer=num_layer, norm_eps=norm_eps, + hidden_units=hidden_units, + head_num=attn_head_num, kv_head_num=kv_head_num, rope_theta=rope_theta, max_position_embeddings=seq_length, diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index f969055759..87983b2551 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -41,6 +41,7 @@ class 
TurbomindModelConfig: tensor_para_size: int = None head_num: int = None kv_head_num: int = None + hidden_units: int = None vocab_size: int = None num_layer: int = None inter_size: int = None @@ -190,14 +191,13 @@ def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig: final_cfg.update(dict(start_id=bos_id, end_id=eos_id)) final_cfg.update(self.input_model.model_info()) - # head_num, vocab_size + # vocab_size for bin in self.input_model.bins(): emb = bin.tok_embeddings() if emb is not None: _vocab_size, dim = emb.shape - head_num = dim // cfg.size_per_head break - final_cfg.update(dict(head_num=head_num, vocab_size=_vocab_size)) + final_cfg.update(dict(vocab_size=_vocab_size)) return TurbomindModelConfig.from_dict(final_cfg, allow_none=True) def export_config(self) -> None: diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 701d8a9a03..68a2cf5ae1 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -57,6 +57,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, size_t head_num, size_t kv_head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, WeightType weight_type, int group_size, @@ -67,7 +68,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, head_num_(head_num), kv_head_num_(kv_head_num), size_per_head_(size_per_head), - hidden_units_(head_num * size_per_head), + hidden_units_(hidden_units), inter_size_(inter_size), weight_type_(weight_type), attn_bias_(attn_bias), @@ -118,7 +119,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, self_attn_weights.qkv.type = weight_type; self_attn_weights.qkv.group_size = group_size; - self_attn_weights.output.input_dims = hidden_units_ / tensor_para_size_; + self_attn_weights.output.input_dims = (head_num * size_per_head) / tensor_para_size_; self_attn_weights.output.output_dims = hidden_units_; self_attn_weights.output.type = weight_type; self_attn_weights.output.group_size = group_size; diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 05600d0f56..07bc65cc5c 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -34,6 +34,7 @@ struct LlamaDecoderLayerWeight { size_t head_num, size_t kv_head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, WeightType weight_type, int group_size, diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index 97465ad6d1..db5a94380c 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -33,6 +33,7 @@ class LlamaFfnLayer { public: LlamaFfnLayer(size_t head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, NcclParam tensor_para, cudaStream_t stream, @@ -42,7 +43,7 @@ class LlamaFfnLayer { head_num_(head_num), size_per_head_(size_per_head), inter_size_(inter_size / tensor_para.world_size_), - hidden_units_(head_num * size_per_head), + hidden_units_(hidden_units), stream_(stream), linear_(linear), allocator_(allocator), diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 55559acb05..eab5a0cea6 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -54,6 +54,7 @@ template LlamaV2::LlamaV2(size_t head_num, size_t 
kv_head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, size_t num_layer, size_t vocab_size, @@ -85,7 +86,7 @@ LlamaV2::LlamaV2(size_t head_num, rmsnorm_eps_(norm_eps), start_id_(start_id), end_id_(end_id), - hidden_units_(head_num * size_per_head), + hidden_units_(hidden_units), local_head_num_(head_num / tensor_para.world_size_), local_kv_head_num_(kv_head_num / tensor_para.world_size_), weights_(weights), @@ -137,6 +138,7 @@ void LlamaV2::initialize(const LlamaAttentionParams& attn_params, unified_decoder_.reset(new UnifiedDecoder(head_num_, kv_head_num, size_per_head_, + hidden_units_, inter_size_, num_layer_, attn_params, diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index adf6c4f9d4..b0a19f4239 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -57,6 +57,7 @@ class LlamaV2 { LlamaV2(size_t head_num, size_t kv_head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, size_t num_layer, size_t vocab_size, diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 507f1a6f32..18ecc2507d 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -28,6 +28,7 @@ template LlamaWeight::LlamaWeight(size_t head_num, size_t kv_head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, size_t vocab_size, size_t num_layer, @@ -37,7 +38,7 @@ LlamaWeight::LlamaWeight(size_t head_num, LoraParams lora_params, size_t tensor_para_size, size_t tensor_para_rank): - hidden_units_(head_num * size_per_head), + hidden_units_(hidden_units), inter_size_(inter_size), vocab_size_(vocab_size), vocab_size_padded_(vocab_size), @@ -56,6 +57,7 @@ LlamaWeight::LlamaWeight(size_t head_num, head_num, kv_head_num, size_per_head, + hidden_units_, inter_size_, weight_type_, group_size, diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index a180204ae2..f71e03715a 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -32,6 +32,7 @@ struct LlamaWeight { LlamaWeight(size_t head_num, size_t kv_head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, size_t vocab_size, size_t num_layer, diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index f632830e5f..58bba45896 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -53,6 +53,7 @@ class UnifiedAttentionLayer { UnifiedAttentionLayer(size_t head_num, size_t kv_head_num, size_t size_per_head, + size_t hidden_units, LlamaAttentionParams attn_params, NcclParam tensor_para, LoraParams lora_params, @@ -64,7 +65,7 @@ class UnifiedAttentionLayer { int quant_policy): head_num_(head_num), size_per_head_(size_per_head), - hidden_units_(head_num * size_per_head), + hidden_units_(hidden_units), local_head_num_(head_num / tensor_para.world_size_), local_kv_head_num_(kv_head_num / tensor_para.world_size_), head_n_rep_(head_num / kv_head_num), diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index e29d42680d..db9482fb48 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -39,6 +39,7 @@ void UnifiedDecoder::initialize(const LlamaAttentionParams& attn_params, attn_layer_ = new 
UnifiedAttentionLayer(head_num_, kv_head_num, size_per_head_, + hidden_units_, attn_params, tensor_para_, lora_params_, @@ -51,6 +52,7 @@ void UnifiedDecoder::initialize(const LlamaAttentionParams& attn_params, ffn_layer_ = new LlamaFfnLayer(head_num_, size_per_head_, + hidden_units_, inter_size_, tensor_para_, stream_, diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 0a80b415d5..b2acbe1b44 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -63,6 +63,7 @@ class UnifiedDecoder { UnifiedDecoder(size_t head_num, size_t kv_head_num, size_t size_per_head, + size_t hidden_units, size_t inter_size, size_t num_layer, const LlamaAttentionParams& attn_params, @@ -84,7 +85,7 @@ class UnifiedDecoder { head_num_(head_num), size_per_head_(size_per_head), inter_size_(inter_size), - hidden_units_(head_num * size_per_head), + hidden_units_(hidden_units), num_layer_(num_layer), rmsnorm_eps_(rmsnorm_eps), tensor_para_(tensor_para), diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index d025935bf7..cb9ea29f48 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -199,6 +199,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, model_name_ = reader.Get("llama", "model_name"); head_num_ = reader.GetInteger("llama", "head_num"); kv_head_num_ = reader.GetInteger("llama", "kv_head_num", 0); + hidden_units_ = reader.GetInteger("llama", "hidden_units"); size_per_head_ = reader.GetInteger("llama", "size_per_head"); inter_size_ = reader.GetInteger("llama", "inter_size"); num_layer_ = reader.GetInteger("llama", "num_layer"); @@ -338,6 +339,7 @@ std::unique_ptr> LlamaTritonModel::createSh auto llama = std::make_unique>(head_num_, kv_head_num_, size_per_head_, + hidden_units_, inter_size_, num_layer_, vocab_size_, @@ -401,6 +403,7 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) shared_weights_[device_id] = std::make_shared>(head_num_, kv_head_num_, size_per_head_, + hidden_units_, inter_size_, vocab_size_, num_layer_, diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index fc7cfca0f2..02736e0f23 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -91,6 +91,7 @@ struct LlamaTritonModel: public AbstractTransformerModel { size_t head_num_; size_t kv_head_num_; + size_t hidden_units_; size_t size_per_head_; size_t inter_size_; size_t num_layer_; From 1c5b1e0c44e8d4f61988699f6b445a087d225a27 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:40:06 +0800 Subject: [PATCH 32/39] Support custom logits processors (#2329) * Support custom logits processors * support logit_bias for pytorch engine * mv to fused logits_processors * replace input_ids with all_ids * type hint --- lmdeploy/messages.py | 12 ++++- lmdeploy/pytorch/engine/engine.py | 37 +++++++-------- lmdeploy/pytorch/engine/logits_process.py | 31 ++++++++++-- lmdeploy/pytorch/messages.py | 8 ++-- lmdeploy/serve/openai/api_server.py | 57 ++++++++++++++++++++--- lmdeploy/serve/openai/protocol.py | 1 + 6 files changed, 111 insertions(+), 35 deletions(-) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index c3e37f0f3f..865c2249de 100644 --- 
a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -1,12 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import enum from dataclasses import dataclass, field -from typing import Dict, List, Literal, Optional +from typing import Callable, Dict, List, Literal, Optional +import torch from pydantic.dataclasses import dataclass as pydantic_dataclass from .tokenizer import Tokenizer +LogitsProcessor = Callable[[torch.Tensor, torch.Tensor], torch.Tensor] +"""LogitsProcessor is a function that takes a tensor of input_ids, the logits +tensor for the next token, and returns a modified tensor of logits +to sample from.""" + @dataclass class GenerationConfig: @@ -50,6 +56,7 @@ class GenerationConfig: min_new_tokens: int = None skip_special_tokens: bool = True logprobs: int = None + logits_processors: Optional[List[LogitsProcessor]] = None @dataclass @@ -99,7 +106,8 @@ def special_word_token_ids(words): random_seed=gen_config.random_seed, skip_special_tokens=gen_config.skip_special_tokens, stop_words=special_word_token_ids(gen_config.stop_words), - bad_words=special_word_token_ids(gen_config.bad_words)) + bad_words=special_word_token_ids(gen_config.bad_words), + logits_processors=gen_config.logits_processors) def __post_init__(self): """Check input validation.""" diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 5758ea08c8..7b5d948396 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -478,7 +478,7 @@ def _batch_stopping_criteria(self, token_ids: torch.Tensor, @logging_timer('SamplingLogits', logger) def async_sampling_logits(self, logits: torch.Tensor, - history_ids: torch.Tensor, + all_ids: torch.Tensor, sampling_inputs: SamplingInputs, inputs: ModelInputs, ignore_eos: torch.Tensor): """sampling logits.""" @@ -494,7 +494,7 @@ def __get_last_logits(): split_logits = __get_last_logits().cuda() logits_processor = FusedLogitsProcessor(sampling_inputs, ignore_eos) - logits = logits_processor(history_ids, split_logits) + logits = logits_processor(split_logits, all_ids) next_token_ids = logits_processor.sampling(logits) return next_token_ids @@ -646,19 +646,18 @@ def __get_q_start_loc(): async def _async_step_background( self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict, - history_ids: torch.Tensor, sampling_inputs: SamplingInputs, + all_ids: torch.Tensor, sampling_inputs: SamplingInputs, num_appendable_ids: torch.LongTensor, num_ignore_eos: torch.LongTensor, output_que: asyncio.Queue): """asyc forward task.""" def __update_inputs(next_token_ids): """update inputs.""" - nonlocal history_ids + nonlocal all_ids inputs.update(next_token_ids) - if history_ids is not None: - history_ids = torch.cat([ - history_ids, next_token_ids[:, None].to(history_ids.device) - ], 1) + if all_ids is not None: + all_ids = torch.cat( + [all_ids, next_token_ids[:, None].to(all_ids.device)], 1) if sampling_inputs.random_offsets is not None: sampling_inputs.random_offsets += 1 @@ -666,8 +665,8 @@ def __update_inputs(next_token_ids): f'batch_size={inputs.seq_length.size(0)} ' f'num_tokens={inputs.input_ids.size(-1)}') is_decoding = inputs.is_decoding - if history_ids is not None: - history_ids = history_ids.cuda() + if all_ids is not None: + all_ids = all_ids.cuda() sampling_inputs = sampling_inputs.to_device('cuda') num_appendable_ids = num_appendable_ids.cuda() num_ignore_eos = num_ignore_eos.cuda() @@ -685,8 +684,7 @@ def __update_inputs(next_token_ids): # sampling next_token_ids = self.async_sampling_logits( - logits, 
history_ids, sampling_inputs, inputs, - num_ignore_eos > 0) + logits, all_ids, sampling_inputs, inputs, num_ignore_eos > 0) num_ignore_eos = num_ignore_eos - 1 # stopping criteria @@ -713,20 +711,21 @@ async def _async_loop_background(self, in_que: asyncio.Queue, out_que: asyncio.Queue): """async loop background.""" - def __gather_history(seqs: SeqList, sampling_inputs: SamplingInputs): + def __gather_all_ids(seqs: SeqList, sampling_inputs: SamplingInputs): """gather history.""" - if sampling_inputs.repetition_penalty is None: + if sampling_inputs.repetition_penalty is None and not any( + sampling_inputs.logits_processors): return None batch = len(seqs) - max_len = max(seq.history_len for seq in seqs) + max_len = max(seq.num_all_ids for seq in seqs) pad_id = self.model_config.bos_token_id pad_id = 0 if pad_id is None else pad_id output = torch.full((batch, max_len), pad_id, dtype=torch.int64) for idx, seq in enumerate(seqs): - h_len = seq.history_len + h_len = seq.num_all_ids if h_len == 0: continue - h_ids = torch.from_numpy(seq.history_ids) + h_ids = torch.from_numpy(seq.all_ids) output[idx, -h_len:] = h_ids return output @@ -761,7 +760,7 @@ def __get_num_ignore_eos(seqs: SeqList): inputs = self.create_model_inputs(running, adapters, is_prefill) sampling_inputs = SamplingInputs.from_sampling_params(running) - history_ids = __gather_history(running, sampling_inputs) + all_ids = __gather_all_ids(running, sampling_inputs) num_appendable_ids = __get_num_appendable_ids(running) num_ignore_eos = __get_num_ignore_eos(running) @@ -772,7 +771,7 @@ def __get_num_ignore_eos(seqs: SeqList): inputs=inputs, swap_in_map=schedule_output.swap_in_map, swap_out_map=schedule_output.swap_out_map, - history_ids=history_ids, + all_ids=all_ids, sampling_inputs=sampling_inputs, num_appendable_ids=num_appendable_ids, num_ignore_eos=num_ignore_eos, diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 6c08ad388b..52f99afa35 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -5,6 +5,8 @@ import torch from transformers.generation.logits_process import LogitsWarper +from lmdeploy.messages import LogitsProcessor + from ..messages import SchedulerSequence @@ -105,6 +107,7 @@ class SamplingInputs: random_offsets: int = None max_top_k: int = 1 min_top_p: float = 1.0 + logits_processors: List[List[LogitsProcessor]] = None @classmethod def from_sampling_params(cls, seqs: List[SchedulerSequence]): @@ -118,6 +121,7 @@ def from_sampling_params(cls, seqs: List[SchedulerSequence]): stop_words = [None] * batch_size random_seeds = [torch.seed() & 0xffffffff] * batch_size random_offsets = [None] * batch_size + logits_processors = [None] * batch_size def __gather_params(): """gather params.""" @@ -138,6 +142,7 @@ def __gather_params(): bw = bw + sw bad_words[idx] = bw stop_words[idx] = sw + logits_processors[idx] = param.logits_processors def __get_topp(top_p): """get topp.""" @@ -201,6 +206,7 @@ def __get_bad_words(bad_words): random_offsets=random_offsets, max_top_k=max_top_k, min_top_p=min_top_p, + logits_processors=logits_processors, ) return sampling_input @@ -216,6 +222,16 @@ def to_device(self, device: str): return SamplingInputs(**out_dict) +def _apply_custom_logits_processors(batched_logits_processors, all_ids, + logits): + """Apply custom logits processors.""" + for seq_id, processors in enumerate(batched_logits_processors): + if processors is not None: + for processor in processors: + logits[seq_id] = 
processor(all_ids[seq_id], logits[seq_id]) + return logits + + class FusedLogitsProcessor(LogitsWarper): """Custom logits processor.""" @@ -224,17 +240,17 @@ def __init__(self, sampling_inputs: SamplingInputs, self.sampling_inputs: SamplingInputs = sampling_inputs self.ignore_eos = ignore_eos - def __call__(self, input_ids: torch.LongTensor, - scores: torch.FloatTensor) -> torch.FloatTensor: + def __call__(self, scores: torch.FloatTensor, + all_ids: torch.LongTensor) -> torch.FloatTensor: r""" Args: - input_ids (torch.LongTensor): - Indices of input sequence tokens in the vocabulary. scores (torch.FloatTensor): Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam search or log softmax for each vocabulary token when using beam search + all_ids (torch.LongTensor): All the token ids. + Return: torch.FloatTensor: The processed prediction scores. @@ -243,9 +259,14 @@ def __call__(self, input_ids: torch.LongTensor, sampling_inputs = self.sampling_inputs scores = scores.clone() + custom_logits_processors = self.sampling_inputs.logits_processors + if any(custom_logits_processors): + scores = _apply_custom_logits_processors(custom_logits_processors, + all_ids, scores) + repetition_penalty = sampling_inputs.repetition_penalty if repetition_penalty is not None: - scores = _process_repetition_penalty(scores, input_ids, + scores = _process_repetition_penalty(scores, all_ids, repetition_penalty) temperature = sampling_inputs.temperature diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index dae679e2d4..c0aa9cf61d 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -2,12 +2,12 @@ import enum import time from dataclasses import dataclass, field -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import numpy as np from torch import Tensor -from lmdeploy.messages import EngineGenerationConfig +from lmdeploy.messages import EngineGenerationConfig, LogitsProcessor from lmdeploy.utils import get_logger from .block import LogicalTokenBlocks @@ -46,6 +46,7 @@ class SamplingParam: bad_words: List[int] = field(default_factory=list) max_new_tokens: int = 512 min_new_tokens: int = 0 + logits_processors: Optional[List[LogitsProcessor]] = None @classmethod def from_gen_config(self, gen_config: EngineGenerationConfig): @@ -97,7 +98,8 @@ def from_gen_config(self, gen_config: EngineGenerationConfig): stop_words=stop_words, bad_words=bad_words, max_new_tokens=max_new_tokens, - min_new_tokens=min_new_tokens) + min_new_tokens=min_new_tokens, + logits_processors=gen_config.logits_processors) class MessageStatus(enum.Enum): diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 040f03ec2a..6eea1066cc 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -3,6 +3,7 @@ import copy import os import time +from functools import partial from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Literal, Optional, Union @@ -13,8 +14,8 @@ from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer from lmdeploy.archs import get_task -from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, - TurbomindEngineConfig) +from lmdeploy.messages import (GenerationConfig, LogitsProcessor, + PytorchEngineConfig, TurbomindEngineConfig) from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.async_engine import AsyncEngine from lmdeploy.serve.openai.protocol import ( # noqa: E501 @@ 
-239,6 +240,40 @@ async def health() -> Response: return Response(status_code=200) +# modified from https://github.com/vllm-project/vllm/blob/v0.5.4/vllm/entrypoints/openai/logits_processors.py#L51 # noqa +def logit_bias_logits_processor(logit_bias: Union[Dict[int, float], + Dict[str, float]], + tokenizer) -> LogitsProcessor: + try: + # Convert token_id to integer + # Clamp the bias between -100 and 100 per OpenAI API spec + clamped_logit_bias: Dict[int, float] = { + int(token_id): min(100.0, max(-100.0, bias)) + for token_id, bias in logit_bias.items() + } + except ValueError as exc: + raise ValueError( + 'Found token_id in logit_bias that is not ' + 'an integer or string representing an integer') from exc + + # Check if token_id is within the vocab size + for token_id, bias in clamped_logit_bias.items(): + if token_id < 0 or token_id >= tokenizer.vocab_size: + raise ValueError(f'token_id {token_id} in logit_bias contains ' + 'out-of-vocab token id') + + def _logit_bias_processor( + logit_bias, + token_ids, + logits, + ): + for token_id, bias in logit_bias.items(): + logits[token_id] = logits[token_id] + bias + return logits + + return partial(_logit_bias_processor, clamped_logit_bias) + + @app.post('/v1/chat/completions', dependencies=[Depends(check_api_key)]) async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Request = None): @@ -263,6 +298,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, 1.0 means no penalty - stop (str | List[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. + - logit_bias (Dict): Bias to logits. Only supported in pytorch engine. - tools (List): A list of tools the model may call. Currently, only internlm2 functions are supported as a tool. Use this to specify a list of functions for which the model can generate JSON inputs. @@ -281,8 +317,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, in the decoding. Default to be True. 
Currently we do not support the following features: - - function_call (Users should implement this by themselves) - - logit_bias (not supported yet) - presence_penalty (replaced with repetition_penalty) - frequency_penalty (replaced with repetition_penalty) """ @@ -308,10 +342,20 @@ async def chat_completions_v1(request: ChatCompletionRequest, if isinstance(request.stop, str): request.stop = [request.stop] - gen_logprobs = None + gen_logprobs, logits_processors = None, None if request.logprobs and request.top_logprobs: gen_logprobs = request.top_logprobs + if request.logit_bias is not None: + try: + logits_processors = [ + logit_bias_logits_processor( + request.logit_bias, + VariableInterface.async_engine.tokenizer.model) + ] + except Exception as e: + return create_error_response(HTTPStatus.BAD_REQUEST, str(e)) + gen_config = GenerationConfig( max_new_tokens=request.max_tokens, logprobs=gen_logprobs, @@ -321,7 +365,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, repetition_penalty=request.repetition_penalty, ignore_eos=request.ignore_eos, stop_words=request.stop, - skip_special_tokens=request.skip_special_tokens) + skip_special_tokens=request.skip_special_tokens, + logits_processors=logits_processors) tools = None if request.tools and request.tool_choice != 'none': diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 57eecc45f3..9455fe584c 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -99,6 +99,7 @@ class ChatCompletionRequest(BaseModel): logprobs: Optional[bool] = False top_logprobs: Optional[int] = None n: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = None max_tokens: Optional[int] = Field(default=None, examples=[None]) stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None]) # noqa # yapf: enable From 3d715a317304325f1cc6e5811cae6bb2fc5b5dbf Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Thu, 22 Aug 2024 15:06:09 +0800 Subject: [PATCH 33/39] torch engine optimize prefill for long context (#1962) * optimize prefill * merge main, update attention implementation * check window * request no response * fix response * staged attention * optimize prefill --- lmdeploy/pytorch/engine/engine.py | 71 ++- lmdeploy/pytorch/engine/engine_instance.py | 19 +- .../pytorch/kernels/cuda/pagedattention.py | 545 +++++------------- 3 files changed, 215 insertions(+), 420 deletions(-) diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 7b5d948396..d37ab1b8c7 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -86,6 +86,21 @@ def _get_adapter_ids(seqs: SeqList, adapters: AdapterList): return adapter_ids +def _check_finish(scheduler: Scheduler, current_iter: int): + """dynamic prefill interval.""" + if not scheduler.has_waiting(): + return False + scheduler_config = scheduler.scheduler_config + max_prefill_interval = scheduler_config.prefill_interval + max_batches = scheduler_config.max_batches + num_batches = len(scheduler.running) + ratio = num_batches / max_batches + min_iter = max_prefill_interval * ratio + if current_iter >= min_iter: + return True + return False + + class Engine: """The inference engine of lmdeploy pytorch. 
@@ -99,10 +114,10 @@ def __init__(self, model_path: str, engine_config: PytorchEngineConfig = None, trust_remote_code: bool = True) -> None: - check_env(engine_config.device_type) - check_model(model_path, trust_remote_code) if engine_config is None: engine_config = PytorchEngineConfig() + check_env(engine_config.device_type) + check_model(model_path, trust_remote_code) if engine_config.adapters is not None: check_adapters(list(engine_config.adapters.values())) @@ -245,31 +260,37 @@ def _on_add_session(self, reqs: Request, **kwargs): """on add session callback.""" for req in reqs: session_id = req.data['session_id'] + resp = req.data.get('response', True) resp_type = ResponseType.SESSION_REPEAT if session_id not in self.scheduler.sessions: self.scheduler.add_session(session_id) resp_type = ResponseType.SUCCESS - self._response(resp_type, req.sender_id, req.req_id) + if resp: + self._response(resp_type, req.sender_id, req.req_id) def _on_stop_session(self, reqs: Request, **kwargs): """on stop session callback.""" for req in reqs: session_id = req.data['session_id'] + resp = req.data.get('response', True) resp_type = ResponseType.SESSION_NOT_EXIST if session_id in self.scheduler.sessions: self.scheduler.stop_session(session_id) resp_type = ResponseType.SUCCESS - self._response(resp_type, req.sender_id, req.req_id) + if resp: + self._response(resp_type, req.sender_id, req.req_id) def _on_end_session(self, reqs: Request, **kwargs): """on end session callback.""" for req in reqs: session_id = req.data['session_id'] + resp = req.data.get('response', True) resp_type = ResponseType.SESSION_NOT_EXIST if session_id in self.scheduler.sessions: self.scheduler.end_session(session_id) resp_type = ResponseType.SUCCESS - self._response(resp_type, req.sender_id, req.req_id) + if resp: + self._response(resp_type, req.sender_id, req.req_id) def _on_add_message(self, reqs: Request, **kwargs): """on add message callback.""" @@ -485,10 +506,10 @@ def async_sampling_logits(self, logits: torch.Tensor, def __get_last_logits(): """get last logits.""" - if inputs.is_decoding: + seq_length = inputs.seq_length + if len(seq_length) == logits.size(0): return logits - seq_length = inputs.seq_length last_idx = seq_length.cumsum(-1) - 1 return logits[last_idx, :] @@ -520,7 +541,8 @@ def update_running(self, running: SeqList, next_token_ids: torch.Tensor, @logging_timer('ModelForward', logger) async def _async_model_forward(self, inputs: ModelInputs, - swap_in_map: Dict, swap_out_map: Dict): + swap_in_map: Dict, swap_out_map: Dict, + return_logits: bool): """model forward.""" max_prefill_token_num = self.cache_config.max_prefill_token_num swap_done = False @@ -536,6 +558,11 @@ def __init__(self, max_seq_len): def gather(self, output): """gather.""" logits = output['logits'] + + if not return_logits: + self._out_logits = logits + return + out_logits = self._out_logits start = self._start seq_len = logits.size(-2) @@ -551,6 +578,8 @@ def gather(self, output): def get_logits(self): """get logits.""" + if not return_logits: + return self._out_logits[:, -1:] torch.cuda.synchronize() return self._out_logits @@ -584,7 +613,11 @@ async def __long_context_single_forward(inputs): return tmp_out if inputs.input_ids.numel() <= max_prefill_token_num: - return await __forward(inputs) + ret = await __forward(inputs) + if not return_logits and not inputs.is_decoding: + last_token_loc = inputs.seq_length.cumsum(0) - 1 + ret['logits'] = ret['logits'][:, last_token_loc] + return ret else: return await __long_context_single_forward(inputs) @@ 
-648,7 +681,8 @@ async def _async_step_background( self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict, all_ids: torch.Tensor, sampling_inputs: SamplingInputs, num_appendable_ids: torch.LongTensor, - num_ignore_eos: torch.LongTensor, output_que: asyncio.Queue): + num_ignore_eos: torch.LongTensor, return_logits: bool, + output_que: asyncio.Queue): """asyc forward task.""" def __update_inputs(next_token_ids): @@ -676,9 +710,11 @@ def __update_inputs(next_token_ids): for idx in range(loop_count): # inference - output = await self._async_model_forward(inputs, - swap_in_map=swap_in_map, - swap_out_map=swap_out_map) + output = await self._async_model_forward( + inputs, + swap_in_map=swap_in_map, + swap_out_map=swap_out_map, + return_logits=return_logits) logits = output['logits'] logits = logits[0] # [bs, seq, prob] -> [seq, prob] @@ -694,6 +730,7 @@ def __update_inputs(next_token_ids): # send output stopped = stopped.cpu() finish = stopped.all().item() or (idx == loop_count - 1) + finish = finish or _check_finish(self.scheduler, idx) output = (next_token_ids.cpu(), logits, stopped) output_que.put_nowait((finish, output)) @@ -745,6 +782,10 @@ def __get_num_ignore_eos(seqs: SeqList): ] return torch.tensor(ret) + def __need_logits(seqs: SeqList): + """need logits.""" + return any(seq.return_logits for seq in seqs) + while True: is_prefill = await in_que.get() try: @@ -763,6 +804,7 @@ def __get_num_ignore_eos(seqs: SeqList): all_ids = __gather_all_ids(running, sampling_inputs) num_appendable_ids = __get_num_appendable_ids(running) num_ignore_eos = __get_num_ignore_eos(running) + return_logits = __need_logits(running) self._running = running self._inputs = inputs @@ -775,6 +817,7 @@ def __get_num_ignore_eos(seqs: SeqList): sampling_inputs=sampling_inputs, num_appendable_ids=num_appendable_ids, num_ignore_eos=num_ignore_eos, + return_logits=return_logits, output_que=out_que, ) except Exception as e: @@ -815,6 +858,8 @@ async def __step(prefill: bool): in_que.put_nowait(prefill) finish = False while not finish: + if self.req_manager.has_requests(): + self.req_manager.step() finish, out = await out_que.get() try: if isinstance(out, Exception): diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 4aa930ee8f..9d9ebf9198 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -42,10 +42,8 @@ async def async_try_add_session(req_sender: RequestSender, session_id: int): async def async_end(req_sender: RequestSender, session_id: int): """End the given session.""" - resp = await req_sender.async_send(RequestType.END_SESSION, - dict(session_id=session_id)) - _check_resp_success(resp, (f'Failed to end session: {session_id}. ' - f'Error: {resp.type}.')) + await req_sender.async_send_async( + RequestType.END_SESSION, dict(session_id=session_id, response=False)) async def async_cancel(req_sender: RequestSender, session_id: int): @@ -71,10 +69,8 @@ def try_add_session(req_sender: RequestSender, session_id: int): def end(req_sender: RequestSender, session_id: int): """End the given session.""" - resp = req_sender.send(RequestType.END_SESSION, - dict(session_id=session_id)) - _check_resp_success(resp, (f'Failed to end session: {session_id}. 
' - f'Error: {resp.type}.')) + req_sender.send_async(RequestType.END_SESSION, + dict(session_id=session_id, response=False)) def cancel(req_sender: RequestSender, session_id: int): @@ -156,7 +152,9 @@ async def async_stream_infer( return gen_config = gen_config or EngineGenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) - await async_try_add_session(self.req_sender, session_id) + await self.req_sender.async_send_async( + RequestType.ADD_SESSION, dict(session_id=session_id, + response=False)) input_embeddings_new: List[InputEmbeddings] = None if input_embeddings is not None and len(input_embeddings) > 0: assert len(input_embeddings) == len(input_embedding_ranges) @@ -272,7 +270,8 @@ def __call_async(): gen_config = gen_config or EngineGenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) - try_add_session(self.req_sender, session_id) + self.req_sender.send_async(RequestType.ADD_SESSION, + dict(session_id=session_id, response=False)) input_embeddings_new: List[InputEmbeddings] = None if input_embeddings is not None and len(input_embeddings) > 0: assert len(input_embeddings) == len(input_embedding_ranges) diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index b02bf2eb28..ebfb65cf75 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -17,208 +17,10 @@ assert TRITON_VERSION >= version.parse('2.1.0') -@triton.jit -def _load_block_offsets(offset_ptr, block_id, BLOCK: tl.constexpr): - """load block offsets.""" - offs_n = tl.arange(0, BLOCK) - return tl.load(offset_ptr + block_id) * BLOCK + offs_n - - -@triton.autotune(configs=[ - triton.Config({}, num_stages=1, num_warps=16), - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=1, num_warps=4), -], - key=['BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) -@wrap_jit_func(type_hint=dict( - Q=torch.Tensor, - K=torch.Tensor, - V=torch.Tensor, - sm_scale=float, - KV_seqlens=torch.Tensor, - Block_offsets=torch.Tensor, - Acc_out=torch.Tensor, - stride_qbs=int, - stride_qh=int, - stride_qd=int, - stride_kbs=int, - stride_kh=int, - stride_kd=int, - stride_vbs=int, - stride_vh=int, - stride_vd=int, - stride_ok=int, - stride_obs=int, - stride_oh=int, - stride_od=int, - stride_boffb=int, - kv_group_num=torch.int32, - block_per_cta=torch.int32, - window_size=torch.int32, - head_size=torch.int32, - head_size_v=torch.int32, - shared_kv=bool, - BLOCK_DMODEL=torch.int32, - BLOCK_DV=torch.int32, - BLOCK_N=torch.int32, -)) -@triton.jit -def _fwd_split_kernel( - Q, - K, - V, - sm_scale, - KV_seqlens, - Block_offsets, - Acc_out, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_ok, - stride_obs, - stride_oh, - stride_od, - stride_boffb, - kv_group_num, - block_per_cta, - window_size: tl.constexpr, - head_size: tl.constexpr, - head_size_v: tl.constexpr, - shared_kv: tl.constexpr, - logit_softcapping: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_DV: tl.constexpr, - BLOCK_N: tl.constexpr, -): - """first step kernel of split k attention.""" - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - split_k_id = tl.program_id(2) - - cur_kv_head = cur_head // kv_group_num - - q_seqlen = 1 - kv_seqlen = tl.load(KV_seqlens + cur_batch) - history_len = kv_seqlen - q_seqlen - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - mask_d 
= offs_d < head_size - offs_dv = tl.arange(0, BLOCK_DV) - mask_dv = offs_dv < head_size_v - off_q = (cur_batch * stride_qbs + cur_head * stride_qh + - offs_d * stride_qd) - off_k = (cur_kv_head * stride_kh + offs_d[None, :] * stride_kd) - off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd) - - q = tl.load(Q + off_q, mask=mask_d, other=0).to(tl.float32) - - k_ptrs = K + off_k - v_ptrs = V + off_v - - block_offset_ptrs = Block_offsets + cur_batch * stride_boffb - - # initialize pointer to m and l - m_i = -float('inf') - l_i = float(0) - acc = tl.zeros([BLOCK_DV], dtype=tl.float32) - - kv_len_per_prog = block_per_cta * BLOCK_N - loop_start = kv_len_per_prog * split_k_id - loop_end = tl.minimum(loop_start + kv_len_per_prog, kv_seqlen) - - # load block offset - # dirty - start_block_id = loop_start // BLOCK_N - if window_size > 0: - start_block_id = tl.maximum(history_len - window_size, - loop_start) // BLOCK_N - kv_min_loc = tl.maximum(history_len - window_size, 0) - b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, BLOCK_N) - - loop_start = start_block_id * BLOCK_N - for start_n in range(loop_start, loop_end, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - - mask = (start_n + offs_n[:, None]) < kv_seqlen - - # -- compute qk ---- - k = tl.load( - k_ptrs + b_offset[:, None] * stride_kbs, - mask=mask & mask_d[None, :], - other=0.0, - ) - - if shared_kv: - v = k - else: - v = tl.load( - v_ptrs + b_offset[:, None] * stride_vbs, - mask=mask & mask_dv[None, :], - other=0.0, - ) - - # prefetch b_offset - if start_n + BLOCK_N < loop_end: - start_block_id += 1 - b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, - BLOCK_N) - - qk = tl.sum(q[None, :] * k, 1) - qk *= sm_scale - if logit_softcapping > 0.0: - qk = qk / logit_softcapping - qk = tl.math.tanh(qk) - qk = qk * logit_softcapping - # NOTE: inf - inf = nan, and nan will leads to error - qk_mask = history_len >= (start_n + offs_n) - if window_size > 0: - qk_mask = qk_mask and ((start_n + offs_n) >= kv_min_loc) - qk = tl.where( - qk_mask, - qk, - -float('inf'), - ) - - # -- compute p, m_i and l_i - m_i_new = tl.maximum(m_i, tl.max(qk, 0)) - p = tl.exp(qk - m_i_new) - alpha = tl.exp(m_i - m_i_new) - l_i_new = alpha * l_i + tl.sum(p, 0) - - # -- update output accumulator -- - # scale acc - acc = acc * alpha - - # update acc - p_new = p.to(v.dtype) - acc += tl.sum(p_new[:, None] * v, 0) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - # initialize pointers to output - off_acc = (cur_batch * stride_obs + split_k_id * stride_ok + - cur_head * stride_oh + offs_dv * stride_od) - tl.store(Acc_out + off_acc, acc, mask=mask_dv) - - off_meta = (cur_batch * stride_obs + split_k_id * stride_ok + - cur_head * stride_oh + head_size_v) - tl.store(Acc_out + off_meta + tl.arange(0, 1), m_i) - tl.store(Acc_out + off_meta + 1 + tl.arange(0, 1), l_i) - - @triton.autotune(configs=[ - triton.Config({}, num_stages=1, num_warps=16), - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=1, num_warps=4), + triton.Config({}, num_stages=2, num_warps=16), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=2, num_warps=4), ], key=['BLOCK_H', 'BLOCK_N', 'BLOCK_DMODEL', 'BLOCK_DV']) @wrap_jit_func(type_hint=dict( @@ -264,28 +66,30 @@ def _fwd_grouped_split_kernel( KV_seqlens, Block_offsets, Acc_out, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_ok, - stride_obs, - stride_oh, - 
stride_od, + stride_qbs: tl.constexpr, + stride_qh: tl.constexpr, + stride_qd: tl.constexpr, + stride_kp: tl.constexpr, + stride_kbs: tl.constexpr, + stride_kh: tl.constexpr, + stride_kd: tl.constexpr, + stride_vp: tl.constexpr, + stride_vbs: tl.constexpr, + stride_vh: tl.constexpr, + stride_vd: tl.constexpr, + stride_ok: tl.constexpr, + stride_obs: tl.constexpr, + stride_oh: tl.constexpr, + stride_od: tl.constexpr, stride_boffb, kv_group_num: tl.constexpr, - block_per_cta, window_size: tl.constexpr, head_size: tl.constexpr, head_size_v: tl.constexpr, num_heads_q: tl.constexpr, shared_kv: tl.constexpr, logit_softcapping: tl.constexpr, + SPLIT_K: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_DV: tl.constexpr, BLOCK_N: tl.constexpr, @@ -297,23 +101,32 @@ def _fwd_grouped_split_kernel( cur_kv_head = tl.program_id(1) split_k_id = tl.program_id(2) - heads_per_cta = min(BLOCK_H, kv_group_num) - cur_head = cur_kv_head * heads_per_cta + tl.arange(0, BLOCK_H) - mask_h = cur_head < cur_kv_head * heads_per_cta + heads_per_cta + if BLOCK_H < kv_group_num: + HEAD_PER_CTA: tl.constexpr = BLOCK_H + else: + HEAD_PER_CTA: tl.constexpr = kv_group_num + cur_head = cur_kv_head * HEAD_PER_CTA + tl.arange(0, BLOCK_H) + mask_h = cur_head < cur_kv_head * HEAD_PER_CTA + HEAD_PER_CTA mask_h = mask_h & (cur_head < num_heads_q) q_seqlen = 1 kv_seqlen = tl.load(KV_seqlens + cur_batch) + if kv_seqlen <= 0: + return history_len = kv_seqlen - q_seqlen # initialize offsets offs_n = tl.arange(0, BLOCK_N) offs_d = tl.arange(0, BLOCK_DMODEL) mask_d = offs_d < head_size + offs_d = offs_d % head_size offs_dv = tl.arange(0, BLOCK_DV) mask_dv = offs_dv < head_size_v - off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd) + offs_dv = offs_dv % head_size_v + off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + + offs_n[:, None] * stride_vbs) off_q = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :] * stride_qd) @@ -325,12 +138,14 @@ def _fwd_grouped_split_kernel( if BLOCK_DMODEL1 != 0: offs_d1 = BLOCK_DMODEL + tl.arange(0, BLOCK_DMODEL1) mask_d1 = offs_d1 < head_size + offs_d1 = offs_d1 % head_size off_q1 = (cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d1[None, :] * stride_qd) q1 = tl.load(Q + off_q1, mask=mask_h[:, None] & mask_d1[None, :], other=0) - off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd) + off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) k1_ptrs = K + off_k1 block_offset_ptrs = Block_offsets + cur_batch * stride_boffb @@ -340,7 +155,9 @@ def _fwd_grouped_split_kernel( l_i = tl.zeros([BLOCK_H], dtype=tl.float32) acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32) - kv_len_per_prog = block_per_cta * BLOCK_N + num_total_blocks = tl.cdiv(kv_seqlen, BLOCK_N) + BLOCK_PER_CTA = tl.cdiv(num_total_blocks, SPLIT_K) + kv_len_per_prog = BLOCK_PER_CTA * BLOCK_N loop_start = kv_len_per_prog * split_k_id loop_end = tl.minimum(loop_start + kv_len_per_prog, kv_seqlen) @@ -351,41 +168,21 @@ def _fwd_grouped_split_kernel( start_block_id = tl.maximum(history_len - window_size, loop_start) // BLOCK_N kv_min_loc = tl.maximum(history_len - window_size, 0) - b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, BLOCK_N) loop_start = start_block_id * BLOCK_N for start_n in range(loop_start, loop_end, BLOCK_N): start_n = 
tl.multiple_of(start_n, BLOCK_N) - - mask = (start_n + offs_n) < kv_seqlen + b_offset = tl.load(block_offset_ptrs + start_n // BLOCK_N) # -- compute qk ---- - k = tl.load( - k_ptrs + b_offset[None, :] * stride_kbs, - mask=mask[None, :] & mask_d[:, None], - other=0.0, - ) + k = tl.load(k_ptrs + b_offset * stride_kp) if BLOCK_DMODEL1 != 0: - k1 = tl.load( - k1_ptrs + b_offset[None, :] * stride_kbs, - mask=mask[None, :] & mask_d1[:, None], - other=0.0, - ) + k1 = tl.load(k1_ptrs + b_offset * stride_kp) if shared_kv: v = tl.trans(k) else: - v = tl.load( - v_ptrs + b_offset[:, None] * stride_vbs, - mask=mask[:, None] & mask_dv[None, :], - other=0.0, - ) - - # prefetch b_offset - if start_n + BLOCK_N < loop_end: - start_block_id += 1 - b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, - BLOCK_N) + v = tl.load(v_ptrs + b_offset * stride_vp) qk = tl.zeros([BLOCK_H, BLOCK_N], dtype=tl.float32) qk += tl.dot(q, k) @@ -397,19 +194,20 @@ def _fwd_grouped_split_kernel( qk = tl.math.tanh(qk) qk = qk * logit_softcapping # NOTE: inf - inf = nan, and nan will leads to error - qk_mask = history_len >= (start_n + offs_n) - if window_size > 0: - qk_mask = qk_mask and ((start_n + offs_n) >= kv_min_loc) - qk = tl.where( - qk_mask[None, :], - qk, - -float('inf'), - ) + if start_n + BLOCK_N > history_len or window_size > 0: + qk_mask = history_len >= (start_n + offs_n) + if window_size > 0: + qk_mask = qk_mask and ((start_n + offs_n) >= kv_min_loc) + qk = tl.where( + qk_mask[None, :], + qk, + -float('inf'), + ) # -- compute p, m_i and l_i m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = tl.exp(qk - m_i_new[:, None]) - alpha = tl.exp(m_i - m_i_new) + p = tl.math.fast_expf(qk - m_i_new[:, None]) + alpha = tl.math.fast_expf(m_i - m_i_new) l_i_new = alpha * l_i + tl.sum(p, 1) # -- update output accumulator -- @@ -482,7 +280,7 @@ def _reduce_split_kernel( l_k = tl.load(Acc + offs_mi + 1) m_max = tl.max(m_k, 0) - alpha = tl.exp(m_k - m_max) + alpha = tl.math.fast_expf(m_k - m_max) acc_k = acc_k * alpha[:, None] l_k = l_k * alpha @@ -537,20 +335,22 @@ def _fwd_kernel( KV_seqlens, Block_offsets, Out, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, + stride_qbs: tl.constexpr, + stride_qh: tl.constexpr, + stride_qd: tl.constexpr, + stride_kp: tl.constexpr, + stride_kbs: tl.constexpr, + stride_kh: tl.constexpr, + stride_kd: tl.constexpr, + stride_vp: tl.constexpr, + stride_vbs: tl.constexpr, + stride_vh: tl.constexpr, + stride_vd: tl.constexpr, + stride_obs: tl.constexpr, + stride_oh: tl.constexpr, + stride_od: tl.constexpr, stride_boffb, - kv_group_num, + kv_group_num: tl.constexpr, window_size: tl.constexpr, head_size: tl.constexpr, head_size_v: tl.constexpr, @@ -563,9 +363,9 @@ def _fwd_kernel( BLOCK_DMODEL1: tl.constexpr, ): """paged attention kernel.""" - cur_batch = tl.program_id(0) + cur_batch = tl.program_id(2) cur_head = tl.program_id(1) - start_m = tl.program_id(2) + start_m = tl.program_id(0) cur_kv_head = cur_head // kv_group_num @@ -575,18 +375,24 @@ def _fwd_kernel( history_len = kv_seqlen - q_seqlen block_start_loc = BLOCK_M * start_m + if block_start_loc >= q_seqlen: + return # initialize offsets offs_n = tl.arange(0, BLOCK_N) offs_d = tl.arange(0, BLOCK_DMODEL) offs_dv = tl.arange(0, BLOCK_DV) mask_d = offs_d < head_size + offs_d = offs_d % head_size mask_dv = offs_dv < head_size_v + offs_dv = offs_dv % head_size_v offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) off_q = 
((q_start_loc + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd) - off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd) + off_k = (cur_kv_head * stride_kh + offs_d[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) + off_v = (cur_kv_head * stride_vh + offs_dv[None, :] * stride_vd + + offs_n[:, None] * stride_vbs) q = tl.load(Q + off_q, mask=(offs_m[:, None] < q_seqlen) & mask_d[None, :], @@ -598,10 +404,12 @@ def _fwd_kernel( if BLOCK_DMODEL1 != 0: offs_d1 = BLOCK_DMODEL + tl.arange(0, BLOCK_DMODEL1) mask_d1 = offs_d1 < head_size + offs_d1 = offs_d1 % head_size off_q1 = ((q_start_loc + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d1[None, :] * stride_qd) q1 = tl.load(Q + off_q1, mask=(offs_m[:, None] < q_seqlen) & mask_d1) - off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd) + off_k1 = (cur_kv_head * stride_kh + offs_d1[:, None] * stride_kd + + offs_n[None, :] * stride_kbs) k1_ptrs = K + off_k1 block_offset_ptrs = Block_offsets + cur_batch * stride_boffb @@ -611,45 +419,25 @@ def _fwd_kernel( l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32) - block_mask = tl.where(block_start_loc < q_seqlen, 1, 0) - # this is dirty start_block_id = kv_seqlen - kv_seqlen if window_size > 0: start_block_id = tl.maximum(history_len - window_size, 0) // BLOCK_N kv_min_loc = tl.maximum(history_len + offs_m - window_size, 0) - b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, BLOCK_N) kv_start_loc = start_block_id * BLOCK_N - for start_n in range(kv_start_loc, block_mask * kv_seqlen, BLOCK_N): + for start_n in range(kv_start_loc, kv_seqlen, BLOCK_N): start_n = tl.multiple_of(start_n, BLOCK_N) + b_offset = tl.load(block_offset_ptrs + start_n // BLOCK_N) # -- compute qk ---- - k = tl.load( - k_ptrs + b_offset[None, :] * stride_kbs, - mask=(start_n + offs_n[None, :] < kv_seqlen) & mask_d[:, None], - other=0.0, - ) + k = tl.load(k_ptrs + b_offset * stride_kp) if BLOCK_DMODEL1 != 0: - k1 = tl.load( - k1_ptrs + b_offset[None, :] * stride_kbs, - mask=(start_n + offs_n[None, :] < kv_seqlen) - & mask_d1[:, None], - other=0.0, - ) + k1 = tl.load(k1_ptrs + b_offset * stride_kp) if shared_kv: v = tl.trans(k) else: - v = tl.load( - v_ptrs + b_offset[:, None] * stride_vbs, - mask=(start_n + offs_n[:, None] < kv_seqlen) - & mask_dv[None, :], - other=0.0, - ) - if start_n + BLOCK_N < kv_seqlen: - start_block_id = start_n // BLOCK_N + 1 - b_offset = _load_block_offsets(block_offset_ptrs, start_block_id, - BLOCK_N) + v = tl.load(v_ptrs + b_offset * stride_vp) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) qk += tl.dot(q, k) @@ -661,21 +449,22 @@ def _fwd_kernel( qk = tl.math.tanh(qk) qk = qk * logit_softcapping # NOTE: inf - inf = nan, and nan will leads to error - qk_mask = (history_len + offs_m[:, None]) >= (start_n + - offs_n[None, :]) - if window_size > 0: - qk_mask = qk_mask and ( - (start_n + offs_n[None, :]) >= kv_min_loc[:, None]) - qk = tl.where( - qk_mask, - qk, - float(-1e30), - ) + if start_n + BLOCK_N > history_len or window_size > 0: + qk_mask = (history_len + offs_m[:, None]) >= (start_n + + offs_n[None, :]) + if window_size > 0: + qk_mask = qk_mask and ( + (start_n + offs_n[None, :]) >= kv_min_loc[:, None]) + qk = tl.where( + qk_mask, + qk, + float(-1e30), + ) # -- compute p, m_i and l_i m_i_new = tl.maximum(m_i, tl.max(qk, 1)) - p = tl.exp(qk - m_i_new[:, None]) - alpha = tl.exp(m_i - m_i_new) + p = 
tl.math.fast_expf(qk - m_i_new[:, None]) + alpha = tl.math.fast_expf(m_i - m_i_new) l_i_new = alpha * l_i + tl.sum(p, 1) # -- update output accumulator -- # scale acc @@ -688,7 +477,7 @@ def _fwd_kernel( l_i = l_i_new m_i = m_i_new - acc = acc / l_i[:, None] + acc = tl.math.fast_dividef(acc, l_i[:, None]) # initialize pointers to output off_o = ((q_start_loc + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_dv[None, :] * stride_od) @@ -773,8 +562,8 @@ def _get_block_d(Lk): BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lk) BLOCK_M = max(16, min(BLOCK, 16384 // BLOCK_DMODEL)) num_warps = 4 - num_stages = 1 - grid = (batch, head, triton.cdiv(max_seqlen, BLOCK_M)) + num_stages = 2 + grid = (triton.cdiv(max_seqlen, BLOCK_M), head, batch) _fwd_kernel[grid](q, k, v, @@ -787,9 +576,11 @@ def _get_block_d(Lk): stride_qbs=q.stride(-3), stride_qh=q.stride(-2), stride_qd=q.stride(-1), + stride_kp=k.stride(-4), stride_kbs=k.stride(-3), stride_kh=k.stride(-2), stride_kd=k.stride(-1), + stride_vp=v.stride(-4), stride_vbs=v.stride(-3), stride_vh=v.stride(-2), stride_vd=v.stride(-1), @@ -813,89 +604,49 @@ def _get_block_d(Lk): **kernel_meta) else: SPLIT_K = 4 - block_per_cta = triton.cdiv(block_offsets.size(-1), SPLIT_K) acc = q.new_empty(batch, head, SPLIT_K, Lv + 2, dtype=torch.float32) - if kv_group_num <= 2 or shared_kv: - BLOCK_DMODEL = triton.next_power_of_2(Lk) - if shared_kv: - BLOCK_DV = BLOCK_DMODEL - else: - BLOCK_DV = triton.next_power_of_2(Lv) - grid = (batch, head, SPLIT_K) - _fwd_split_kernel[grid](q, - k, - v, - sm_scale, - kv_seqlens, - block_offsets, - acc, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), - stride_ok=acc.stride(-2), - stride_obs=acc.stride(-4), - stride_oh=acc.stride(-3), - stride_od=acc.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - block_per_cta=block_per_cta, - window_size=window_size, - head_size=Lk, - head_size_v=Lv, - shared_kv=shared_kv, - logit_softcapping=logit_softcapping, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - **kernel_meta) - else: - BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lk) - p2_kv_group_num = triton.next_power_of_2(kv_group_num) - BLOCK_H = max(16, min(BLOCK, p2_kv_group_num)) - grid_1 = triton.cdiv(head, min(BLOCK_H, kv_group_num)) - grid = (batch, grid_1, SPLIT_K) - _fwd_grouped_split_kernel[grid]( - q, - k, - v, - sm_scale, - kv_seqlens, - block_offsets, - acc, - stride_qbs=q.stride(-3), - stride_qh=q.stride(-2), - stride_qd=q.stride(-1), - stride_kbs=k.stride(-3), - stride_kh=k.stride(-2), - stride_kd=k.stride(-1), - stride_vbs=v.stride(-3), - stride_vh=v.stride(-2), - stride_vd=v.stride(-1), - stride_ok=acc.stride(-2), - stride_obs=acc.stride(-4), - stride_oh=acc.stride(-3), - stride_od=acc.stride(-1), - stride_boffb=block_offsets.stride(0), - kv_group_num=kv_group_num, - block_per_cta=block_per_cta, - window_size=window_size, - head_size=Lk, - head_size_v=Lv, - num_heads_q=head, - shared_kv=shared_kv, - logit_softcapping=logit_softcapping, - BLOCK_DMODEL=BLOCK_DMODEL, - BLOCK_DV=BLOCK_DV, - BLOCK_N=BLOCK, - BLOCK_H=BLOCK_H, - BLOCK_DMODEL1=BLOCK_DMODEL1, - **kernel_meta) + BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lk) + p2_kv_group_num = triton.next_power_of_2(kv_group_num) + BLOCK_H = max(16, min(BLOCK, p2_kv_group_num)) + grid_1 = triton.cdiv(head, 
min(BLOCK_H, kv_group_num)) + grid = (batch, grid_1, SPLIT_K) + _fwd_grouped_split_kernel[grid](q, + k, + v, + sm_scale, + kv_seqlens, + block_offsets, + acc, + stride_qbs=q.stride(-3), + stride_qh=q.stride(-2), + stride_qd=q.stride(-1), + stride_kp=k.stride(-4), + stride_kbs=k.stride(-3), + stride_kh=k.stride(-2), + stride_kd=k.stride(-1), + stride_vp=v.stride(-4), + stride_vbs=v.stride(-3), + stride_vh=v.stride(-2), + stride_vd=v.stride(-1), + stride_ok=acc.stride(-2), + stride_obs=acc.stride(-4), + stride_oh=acc.stride(-3), + stride_od=acc.stride(-1), + stride_boffb=block_offsets.stride(0), + kv_group_num=kv_group_num, + window_size=window_size, + head_size=Lk, + head_size_v=Lv, + num_heads_q=head, + shared_kv=shared_kv, + logit_softcapping=logit_softcapping, + SPLIT_K=SPLIT_K, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + BLOCK_DMODEL1=BLOCK_DMODEL1, + **kernel_meta) num_warps = 4 grid = (batch, head) From 2cd7f9597fd31b6f4e09384fa8d21f0b336dcffa Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Thu, 22 Aug 2024 19:41:02 +0800 Subject: [PATCH 34/39] set rope_scaling_factor default value None (#2358) --- lmdeploy/cli/utils.py | 2 +- lmdeploy/messages.py | 4 ++-- lmdeploy/turbomind/chat.py | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index e1f5c41ab2..f25be5ba43 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -182,7 +182,7 @@ def rope_scaling_factor(parser): return parser.add_argument('--rope-scaling-factor', type=float, - default=0.0, + default=None, help='Rope scaling factor') @staticmethod diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 865c2249de..89ddfb6a23 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -151,7 +151,7 @@ class TurbomindEngineConfig: cache_block_seq_len: int = 64 enable_prefix_caching: bool = False quant_policy: int = 0 - rope_scaling_factor: float = 0.0 + rope_scaling_factor: float = None use_logn_attn: bool = False download_dir: Optional[str] = None revision: Optional[str] = None @@ -165,7 +165,7 @@ def __post_init__(self): assert self.max_batch_size >= 1, 'max_batch_size must be a positive integer' # noqa assert self.cache_max_entry_count > 0 and self.cache_max_entry_count < 1, 'invalid cache_max_entry_count' # noqa assert self.quant_policy in (0, 4, 8), 'invalid quant_policy' - assert self.rope_scaling_factor >= 0, 'invalid rope_scaling_factor' + assert self.rope_scaling_factor is None or self.rope_scaling_factor > 0, 'invalid rope_scaling_factor' # noqa assert self.max_prefill_token_num >= 0, 'invalid max_prefill_token_num' assert self.num_tokens_per_iter >= 0, 'invalid num_tokens_per_iter' diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index ba488b77a4..7d06c95a06 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -40,7 +40,7 @@ def main(model_path: str, quant_policy: int = 0, cache_max_entry_count: float = 0.8, cache_block_seq_len: int = 64, - rope_scaling_factor: float = 0.0, + rope_scaling_factor: float = None, enable_prefix_caching: bool = False, session_len: int = None, stream_output: bool = True, @@ -63,7 +63,7 @@ def main(model_path: str, quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4 cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache. 
cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64 - rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention + rope_scaling_factor (float): scaling factor used for dynamic ntk, default to None. TurboMind follows the implementation of transformer LlamaAttention enable_prefix_caching (bool): whether enable prefix caching session_len (int): the length input output tokens stream_output (bool): indicator for streaming output or not @@ -95,8 +95,9 @@ def main(model_path: str, cache_block_seq_len=cache_block_seq_len, enable_prefix_caching=enable_prefix_caching, quant_policy=quant_policy, - rope_scaling_factor=rope_scaling_factor, tp=tp) + if rope_scaling_factor: + engine_cfg.rope_scaling_factor = rope_scaling_factor print('engine_cfg:\n', engine_cfg, sep='', flush=True) from lmdeploy import turbomind as tm From b78c8b9cd1ed00b940603b8c6ecdc279cccbce82 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 22 Aug 2024 19:44:20 +0800 Subject: [PATCH 35/39] Refactor turbomind (1/N) (#2352) * refactor turbomind * minor * use `size_t` for size type * fix * minor * print split-fuse param --- .../kernels/attention/attention_params.h | 1 + src/turbomind/models/llama/LlamaBatch.cc | 242 ++++++++--- src/turbomind/models/llama/LlamaBatch.h | 108 +++-- .../models/llama/LlamaDecoderLayerWeight.cc | 14 +- .../models/llama/LlamaDecoderLayerWeight.h | 2 +- src/turbomind/models/llama/LlamaFfnLayer.cc | 8 +- src/turbomind/models/llama/LlamaFfnLayer.h | 45 +- src/turbomind/models/llama/LlamaV2.cc | 266 +++--------- src/turbomind/models/llama/LlamaV2.h | 139 ++----- src/turbomind/models/llama/LlamaWeight.cc | 4 +- src/turbomind/models/llama/LlamaWeight.h | 2 +- src/turbomind/models/llama/context.h | 25 ++ src/turbomind/models/llama/llama_params.h | 23 +- .../models/llama/unified_attention_layer.cc | 67 ++- .../models/llama/unified_attention_layer.h | 82 ++-- src/turbomind/models/llama/unified_decoder.cc | 77 ++-- src/turbomind/models/llama/unified_decoder.h | 74 +--- src/turbomind/python/bind.cpp | 6 +- .../triton_backend/llama/LlamaTritonModel.cc | 391 ++++++++---------- .../triton_backend/llama/LlamaTritonModel.h | 56 +-- .../llama/LlamaTritonModelInstance.cc | 20 +- .../llama/LlamaTritonModelInstance.h | 18 +- .../transformer_triton_backend.hpp | 2 + 23 files changed, 774 insertions(+), 898 deletions(-) create mode 100644 src/turbomind/models/llama/context.h diff --git a/src/turbomind/kernels/attention/attention_params.h b/src/turbomind/kernels/attention/attention_params.h index e244de04ea..8e0e52195d 100644 --- a/src/turbomind/kernels/attention/attention_params.h +++ b/src/turbomind/kernels/attention/attention_params.h @@ -20,6 +20,7 @@ struct BlockIteratorParams { int block_len; }; +/// TODO: Rename to attention::Param template struct AttentionParams { // token-level buffers, [B, qH + 2kvH, D] or [B, kvH, D] diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 987ac9e86f..8dc0cffeed 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -364,15 +365,15 @@ void LlamaBatch::ProcessInferRequests(const Requests& requests) // compute rope scaling factor if (r->start_flag) { - seq.rope_theta = model_->attn_params_.rotary_embedding_base; - if (model_->attn_params_.use_dynamic_ntk) { - auto scaling_factor = 
model_->attn_params_.rope_scaling_factor; + seq.rope_theta = model_->attn_param_.rotary_embedding_base; + if (model_->attn_param_.use_dynamic_ntk) { + auto scaling_factor = model_->attn_param_.rope_scaling_factor; if (scaling_factor >= 1.f) { // infer by current context length auto max_seq_len = state.h_context_length[idx]; - auto max_pos_emb = model_->attn_params_.max_position_embeddings; + auto max_pos_emb = model_->attn_param_.max_position_embeddings; if (max_seq_len > max_pos_emb) { scaling_factor = scaling_factor * max_seq_len / max_pos_emb - (scaling_factor - 1); - float rope_dim = model_->attn_params_.rotary_embedding_dim; + float rope_dim = model_->attn_param_.rotary_embedding_dim; seq.rope_theta *= powf(scaling_factor, rope_dim / (rope_dim - 2.f)); TM_LOG_INFO("[ProcessInferRequests] %ld rope_scaling_factor: %f, rope_theta = %f", (long)seq.id, @@ -499,7 +500,7 @@ void LlamaBatch::Initialize(GenerationState& g) }; // TM_LOG_INFO("max_input_count %d", max_input_count); - auto outcome = sequence_manager_->Materialize(sequences, context_lengths, priorities, step_length_, adjust); + auto outcome = sequence_manager_->Materialize(sequences, context_lengths, priorities, 1, adjust); if (outcome.allocation || outcome.swap_in || outcome.swap_out) { dbg(outcome); @@ -711,9 +712,9 @@ void LlamaBatch::AllocateBuffer(size_t batch_size, size_t session_len, int ca const size_t max_batch_block_count = batch_size * ((session_len + cache_block_seq_len - 1) / cache_block_seq_len) + 1; - if (model_->lora_params_.policy == LoraPolicy::kPlora) { - lora_mask_buf_ = (int*)allocator_->reMalloc(lora_mask_buf_, sizeof(int) * max_forward_token_num_, false); - size_t sz = sizeof(T) * max_forward_token_num_ * (hidden_units + model_->lora_params_.max_wo_r); + if (model_->lora_param_.policy == LoraPolicy::kPlora) { + lora_mask_buf_ = (int*)allocator_->reMalloc(lora_mask_buf_, sizeof(int) * max_forward_token_num_, false); + const size_t sz = sizeof(T) * max_forward_token_num_ * (hidden_units + model_->lora_param_.max_wo_r); context_decoder_output_buf_ = (T*)peer_allocator_->reMalloc(context_decoder_output_buf_, sz, false); } else { @@ -932,29 +933,67 @@ void LlamaBatch::FreeBuffer() } template -LlamaBatch::LlamaBatch(const EngineParams& params, int cache_block_seq_len, int quant_policy, LlamaV2* model): - max_batch_size_(params.max_batch_size), - max_forward_token_num_(params.max_prefill_token_num + params.max_batch_size), - max_context_token_num_(params.max_context_token_num), - session_len_(params.session_len), +LlamaBatch::~LlamaBatch() +{ + TM_LOG_DEBUG("~LlamaBatch()"); + shared_state_->request_queue.close(); + + internal_thread_.join(); + + if (output_thread_.joinable()) { + { + std::lock_guard lock{output_mutex_}; + output_stop_token_ = true; + } + output_cv_.notify_one(); + output_thread_.join(); + } + + // The dtor maybe called from unknown thread, set device id before CUDA calls + check_cuda_error(cudaSetDevice(device_id_)); + check_cuda_error(cudaStreamSynchronize(stream_)); + + FreeBuffer(); + + model_.reset(); + sequence_manager_.reset(); + context_.reset(); // This destroy all objects in context except for `stream` + + check_cuda_error(cudaStreamSynchronize(stream_)); + + // Destroy the stream in context + check_cuda_error(cudaStreamDestroy(stream_)); +} + +template +LlamaBatch::LlamaBatch(const EngineParam& param, + std::unique_ptr> model, // ! This is moved + std::unique_ptr> ctx, // ! 
This is moved + std::shared_ptr state, + int device_id): + param_(param), + shared_state_(state), + max_batch_size_(param.max_batch_size), + max_forward_token_num_(param.max_prefill_token_num + param.max_batch_size), + max_context_token_num_(param.max_context_token_num), + num_tokens_per_iter_(param.num_tokens_per_iter), + max_prefill_iters_(param.max_prefill_iters), + device_id_(device_id), rank_(model->tensor_para_.rank_), - debug_(model->debug_), - step_length_(params.step_length), - model_(model), data_type_(getTensorType()), - num_tokens_per_iter_(params.num_tokens_per_iter), - max_prefill_iters_(params.max_prefill_iters) + debug_(isDebug()), + stream_(ctx->stream), + allocator_(ctx->allocator.get()), + peer_allocator_(ctx->peer_allocator.get()), + cublas_wrapper_(ctx->cublas_wrapper.get()), + context_(std::move(ctx)), + model_(std::move(model)), + session_len_(param.session_len) { - stream_ = model_->stream_; - allocator_ = model_->allocator_; - peer_allocator_ = model_->peer_allcator_; - cublas_wrapper_ = model_->cublas_wrapper_; + const auto cache_block_seq_len = model_->attn_param_.cache_block_seq_len; - const int elem_bits = quant_policy ? quant_policy : bitsof; - - auto get_free_size = [&] { - return GetSyncFreeMemSize(*model_->shared_state_->barrier, model_->shared_state_->free_size); - }; + const auto quant_policy = model_->param_.quant_policy; + const int elem_bits = quant_policy ? quant_policy : bitsof; SequenceManager::BlockConfig block_config{ (int)model_->size_per_head_, @@ -964,12 +1003,16 @@ LlamaBatch::LlamaBatch(const EngineParams& params, int cache_block_seq_len, i elem_bits, }; - sequence_manager_.reset(new SequenceManager{model_->num_layer_, + const auto get_free_size = [&] { // + return GetSyncFreeMemSize(*shared_state_->barrier, shared_state_->free_size); + }; + + sequence_manager_.reset(new SequenceManager{model_->layer_num_, block_config, - params.cache_max_block_count, - params.cache_chunk_size, - params.enable_prefix_caching, - model->tensor_para_.rank_, + param.cache_max_block_count, + param.cache_chunk_size, + param.enable_prefix_caching, + model_->tensor_para_.rank_, allocator_, get_free_size}); @@ -1422,19 +1465,17 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig } template -void LlamaBatch::InternalThreadEntry(int device_id) +void LlamaBatch::InternalThreadEntry() { // TM_LOG_INFO("[InternalThreadEntry] %d", (int)rank_); - check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device_id_)); // Initialize `AnomalyHandler` AnomalyHandler::instance().Init(rank_, model_->vocab_size_padded_, model_->end_id_, max_batch_size_, stream_); - auto& shared_state = model_->shared_state_; - - auto& request_queue = shared_state->request_queue; - auto& infer_requests = shared_state->infer_requests; - auto& stop_requests = shared_state->stop_requests; + auto& request_queue = shared_state_->request_queue; + auto& infer_requests = shared_state_->infer_requests; + auto& stop_requests = shared_state_->stop_requests; GenerationState g{}; @@ -1449,8 +1490,8 @@ void LlamaBatch::InternalThreadEntry(int device_id) infer_requests.clear(); if (is_empty || request_counter % request_interval == 0) { // Block if batch is empty - request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state->abort); - if (!shared_state->abort) { + request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state_->abort); + if (!shared_state_->abort) { RejectInvalidRequests(stop_requests, 
infer_requests); } } @@ -1459,9 +1500,9 @@ void LlamaBatch::InternalThreadEntry(int device_id) NvtxScope scope("mainloop"); // wait while rank-0 is dequeueing - shared_state->barrier->wait(); + shared_state_->barrier->wait(); - if (shared_state->abort) { + if (shared_state_->abort) { TM_LOG_INFO("[InternalThreadEntry] stop requested."); return; } @@ -1472,14 +1513,12 @@ void LlamaBatch::InternalThreadEntry(int device_id) ProcessInferRequests(infer_requests); // Wait while shared `requests` is being used - shared_state->barrier->wait(); + shared_state_->barrier->wait(); SendSignals(std::move(signals)); Initialize(g); - FT_CHECK(step_length_ == 1); - if (state_->active_size) { // (void)Forward(g); @@ -1489,7 +1528,7 @@ void LlamaBatch::InternalThreadEntry(int device_id) // Finished requests and corresponding output tensors will be released when notified // wait for all ranks to ensure no rank (except for output thread) will access related // resources - shared_state->barrier->wait(); + shared_state_->barrier->wait(); } SendSignals(std::move(signals)); } @@ -1520,9 +1559,7 @@ template void LlamaBatch::Start() { TM_LOG_INFO("LlamaBatch::Start()"); - int device_id = -1; - check_cuda_error(cudaGetDevice(&device_id)); - internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this, device_id); + internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this); if (rank_ == 0) { output_thread_ = std::thread(&LlamaBatch::OutputThreadEntry, this); } @@ -1543,15 +1580,15 @@ void LlamaBatch::OutputThreadEntry() } signals = std::move(output_signals_); } - if (rank_ == 0 && model_->ffi_lock_) { - model_->ffi_lock_(1); + if (rank_ == 0 && ffi_lock_) { + ffi_lock_(1); } // invoke stream cbs & signals for (const auto& s : signals) { s(); } - if (rank_ == 0 && model_->ffi_lock_) { - model_->ffi_lock_(0); + if (rank_ == 0 && ffi_lock_) { + ffi_lock_(0); } } } @@ -1760,6 +1797,101 @@ bool LlamaBatch::Forward(GenerationState& g) return true; } +static inline Tensor slice(const Tensor& tensor, int index) +{ + auto shape = tensor.shape; + if (shape.at(0) == 1) { + return tensor; + } + shape[0] = 1; + const auto offset = std::accumulate(shape.begin(), shape.end(), (size_t)index, std::multiplies<>{}); + return tensor.slice(shape, offset); +} + +// ! 
implicit conversion from `unordered_map` to `TensorMap` drops 0-sized tensors +static inline TensorMap slice(const std::unordered_map& src, int index) +{ + TensorMap dst; + for (const auto& kv : src) { + dst.insert({kv.first, slice(kv.second, index)}); + } + return dst; +} + +template +void LlamaBatch::Submit(std::unordered_map* outputs, + const std::unordered_map* inputs, + Control control) +{ + if (debug_) { + for (const auto& kv : *inputs) { + TM_LOG_INFO("[Submit] INPUT: %s", format(kv).c_str()); + } + for (const auto& kv : *outputs) { + TM_LOG_INFO("[Submit] OUTPUT: %s", format(kv).c_str()); + } + } + + const int batch_size = outputs->at("output_ids").shape[0]; + + std::vector> requests(batch_size); + + // allocates all requests for the batch + for (int i = 0; i < batch_size; ++i) { + requests[i] = std::make_shared(); + } + + for (int i = 0; i < batch_size; ++i) { + auto& r = requests[i]; + + r->inputs = slice(*inputs, i); + r->outputs = slice(*outputs, i); + + r->id = r->inputs.getVal("CORRID", i); + r->start_flag = r->inputs.getVal("START", 1); + r->end_flag = r->inputs.getVal("END", 1); + r->stop_flag = r->inputs.getVal("STOP", 0); + r->stream_cb = control.callback; + } + + // Submits the tasks and wait for finish + std::vector error_codes; + bool has_error = 0; + + TM_LOG_INFO("[forward] Enqueue requests"); + + std::vector ids; + for (const auto& r : requests) { + ids.push_back(r->id); + } + + auto futures = shared_state_->request_queue.enqueue(std::move(requests)); + + FT_CHECK_WITH_INFO(ids.size() == futures.size(), "check failed"); + + TM_LOG_INFO("[forward] Wait for requests to complete ..."); + + for (int i = 0; i < futures.size(); ++i) { + auto ec = futures[i].get(); + error_codes.push_back(ec); + if (ec) { + has_error = true; + TM_LOG_WARNING("[forward] Request failed for %ld, code %d", (long)ids[i], (int)ec); + } + else { + TM_LOG_INFO("[forward] Request completed for %ld", (long)ids[i]); + } + } + + if (has_error) { + std::stringstream ss; + for (int i = 0; i < error_codes.size(); ++i) { + ss << (i ? 
"" : " ") << error_codes[i]; + } + throw std::runtime_error(ss.str()); + } +} + template class LlamaBatch; #ifdef ENABLE_FP32 template class LlamaBatch; diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index f0345af6d2..236bf04e08 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -8,17 +8,35 @@ #include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/SequenceManager.h" +#include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/instance_comm.h" #include #include #include +using ffi_api_lock_ctrl_t = std::function; + namespace turbomind { +struct SharedState { + std::vector> infer_requests; + std::vector> stop_requests; + RequestQueue request_queue; + std::shared_ptr barrier; + bool abort; + std::atomic free_size{std::numeric_limits::max()}; +}; + +struct Control { + AbstractInstanceComm* comm; + Request::Callback callback; +}; + struct BatchState { int* h_prompt_length; // history + input, ignore generated int* h_context_length; @@ -97,31 +115,37 @@ class LlamaBatch { const std::vector& lengths, const std::vector& sequences); - explicit LlamaBatch(const EngineParams& params, int cache_block_seq_len, int quant_policy, LlamaV2* model); + explicit LlamaBatch(const EngineParam& param, + std::unique_ptr> model, + std::unique_ptr> ctx, + std::shared_ptr state, + int device_id); + + ~LlamaBatch(); - ~LlamaBatch() + void Start(); + + void Submit(std::unordered_map* outputs, + const std::unordered_map* inputs, + Control control); + + void set_ffi_lock(ffi_api_lock_ctrl_t func) { - TM_LOG_INFO("~LlamaBatch()"); - model_->shared_state_->request_queue.close(); - - internal_thread_.join(); - - if (output_thread_.joinable()) { - { - std::lock_guard lock{output_mutex_}; - output_stop_token_ = true; - } - output_cv_.notify_one(); - output_thread_.join(); - } + ffi_lock_ = func; + } - FreeBuffer(); + LlamaV2& model() noexcept + { + return *model_; } - void Start(); + int session_len() const noexcept + { + return session_len_; + } private: - void InternalThreadEntry(int device_id); + void InternalThreadEntry(); void OutputThreadEntry(); @@ -186,16 +210,30 @@ class LlamaBatch { } private: - const int max_batch_size_; - const int max_forward_token_num_; - const int max_context_token_num_; - int session_len_; - const int rank_; - const bool debug_; - const int step_length_; - - LlamaV2* const model_; - + const EngineParam param_; + + const std::shared_ptr shared_state_; + + const int max_batch_size_; + const int max_forward_token_num_; + const int max_context_token_num_; + const int num_tokens_per_iter_; + const int max_prefill_iters_; + const int device_id_; + const int rank_; + const DataType data_type_; + const bool debug_; + + // Refs into `Context` + cudaStream_t const stream_{}; + cublasMMWrapper* const cublas_wrapper_{}; + IAllocator* const allocator_{}; + IAllocator* const peer_allocator_{}; + + int session_len_; // May be truncated in ctor + + std::unique_ptr> context_; + std::unique_ptr> model_; std::unique_ptr sequence_manager_; /////////////////////////////////////////////////////////////////// @@ -275,8 +313,6 @@ class LlamaBatch { // hard limits for persistent 
buffers static constexpr int kMaxStopBadWordsLen = 32; - const DataType data_type_{}; - bool is_allocate_persistant_buffer_ = false; bool is_allocate_buffer_ = false; @@ -285,11 +321,6 @@ class LlamaBatch { std::vector> sampling_params_; - cudaStream_t stream_{}; - cublasMMWrapper* cublas_wrapper_{}; - IAllocator* allocator_{}; - IAllocator* peer_allocator_{}; - std::thread internal_thread_; // async stream callback utils @@ -298,11 +329,12 @@ class LlamaBatch { std::condition_variable output_cv_; std::vector output_signals_; bool output_stop_token_{false}; + ffi_api_lock_ctrl_t ffi_lock_; int* h_output_ids_{}; - - const int num_tokens_per_iter_; - const int max_prefill_iters_; }; +template +using Engine = LlamaBatch; + } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 68a2cf5ae1..c2f39a0962 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -61,7 +61,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, size_t inter_size, WeightType weight_type, int group_size, - LoraParams lora_params, + LoraParam lora_param, bool attn_bias, size_t tensor_para_size, size_t tensor_para_rank): @@ -75,7 +75,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank) { - if (lora_params.policy == LoraPolicy::kPlora) { + if (lora_param.policy == LoraPolicy::kPlora) { std::vector keys = { "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; std::vector*> weights = {&self_attn_weights.qkv, @@ -86,18 +86,18 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, for (int i = 0; i < keys.size(); i++) { const auto& name = keys[i]; auto& weight = *weights[i]; - int rank = lora_params.r; - float scale = lora_params.scale; + int rank = lora_param.r; + float scale = lora_param.scale; std::string full_name = "layers." + std::to_string(layer_idx) + "." 
+ name; - for (const auto& [re, pr] : lora_params.rank_pattern) { + for (const auto& [re, pr] : lora_param.rank_pattern) { if (std::regex_search(full_name, pr.first)) { rank = pr.second; TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank); break; } } - for (const auto& [re, pr] : lora_params.scale_pattern) { + for (const auto& [re, pr] : lora_param.scale_pattern) { if (std::regex_search(full_name, pr.first)) { scale = pr.second; TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale); @@ -107,7 +107,7 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_idx, if (rank) { weight.lora.r = rank; weight.lora.scale = scale; - weight.lora.policy = lora_params.policy; + weight.lora.policy = lora_param.policy; } } } diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 07bc65cc5c..ea6a45b862 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -38,7 +38,7 @@ struct LlamaDecoderLayerWeight { size_t inter_size, WeightType weight_type, int group_size, - LoraParams lora_params, + LoraParam lora_param, bool attn_bias, size_t tensor_para_size, size_t tensor_para_rank); diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 974b340ec1..b837c8b7b1 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -103,7 +103,7 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, const auto type = weights->is_fused_silu ? LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; - linear_.forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type); + linear_->forward(gating_buf_, ffn_input_data, num_token, weights->fused_gating_intermediate, type); sync_check_cuda_error(); if (!weights->is_fused_silu) { @@ -115,14 +115,14 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, else { { // w1(x) NvtxScope scope("w1"); - linear_.forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear::kGemm, lora_mask); + linear_->forward(gating_buf_, ffn_input_data, num_token, weights->gating, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } count_and_fix(gating_buf_, num_token * weights->gating.output_dims, Concat("w1", layer_id), 3); { // w3(x) NvtxScope scope("w3"); - linear_.forward( + linear_->forward( inter_buf_, ffn_input_data, num_token, weights->intermediate, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } @@ -137,7 +137,7 @@ void LlamaFfnLayer::forward(TensorMap* output_tensors, { // w2(x) NvtxScope scope("w2"); const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size_ * 2 : 0; - linear_.forward( + linear_->forward( ffn_output_data, {gating_buf_, pitch}, num_token, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); } diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index db5a94380c..3ea7df0b20 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,9 +19,9 @@ #pragma once -// #include "src/turbomind/layers/FfnLayer.h" #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/models/llama/context.h" #include "src/turbomind/utils/custom_ar_comm.h" #include "src/turbomind/utils/nccl_utils.h" #include @@ -31,24 +31,14 @@ namespace turbomind { template class LlamaFfnLayer { public: - LlamaFfnLayer(size_t head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - NcclParam tensor_para, - cudaStream_t stream, - LlamaLinear linear, - IAllocator* allocator, - bool is_free_buffer_after_forward): - head_num_(head_num), - size_per_head_(size_per_head), - inter_size_(inter_size / tensor_para.world_size_), - hidden_units_(hidden_units), - stream_(stream), - linear_(linear), - allocator_(allocator), - tensor_para_(tensor_para), - is_free_buffer_after_forward_(is_free_buffer_after_forward) + LlamaFfnLayer(const ModelParam& model, const NcclParam& tp, const Context& ctx): + inter_size_(model.inter_size / tp.world_size_), + hidden_units_(model.hidden_units), + tensor_para_(tp), + stream_(ctx.stream), + linear_(ctx.linear.get()), + allocator_(ctx.allocator.get()) + { } @@ -66,20 +56,17 @@ class LlamaFfnLayer { void activation(int token_num, bool is_chunked); - size_t head_num_; - size_t size_per_head_; - size_t inter_size_; - size_t hidden_units_; - cudaStream_t stream_; - LlamaLinear linear_; - IAllocator* allocator_; - bool is_free_buffer_after_forward_; + const size_t inter_size_; + const size_t hidden_units_; + const NcclParam tensor_para_; + cudaStream_t const stream_; + LlamaLinear* const linear_; + IAllocator* const allocator_; + bool is_free_buffer_after_forward_{}; T* gating_buf_{}; T* inter_buf_{}; - NcclParam tensor_para_; - bool is_allocate_buffer_{}; }; diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index eab5a0cea6..a69df127de 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -21,6 +21,7 @@ // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc #include "src/turbomind/models/llama/LlamaV2.h" +#include "src/turbomind/kernels/attention/attention_params.h" #include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/gpt_kernels.h" @@ -50,117 +51,66 @@ namespace turbomind { -template -LlamaV2::LlamaV2(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - float norm_eps, - const LlamaAttentionParams& attn_params, - int start_id, - int end_id, - int cache_block_seq_len, - int quant_policy, - bool use_context_fmha, - const EngineParams& engine_params, - const LoraParams& lora_params, - std::shared_ptr shared_state, - LlamaWeight* weights, - NcclParam tensor_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - IAllocator* peer_alloctor, - bool 
is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop): - head_num_(head_num), - size_per_head_(size_per_head), - inter_size_(inter_size), - num_layer_(num_layer), - vocab_size_(vocab_size), - attn_params_(attn_params), - vocab_size_padded_(vocab_size), - rmsnorm_eps_(norm_eps), - start_id_(start_id), - end_id_(end_id), - hidden_units_(hidden_units), - local_head_num_(head_num / tensor_para.world_size_), - local_kv_head_num_(kv_head_num / tensor_para.world_size_), - weights_(weights), - tensor_para_(tensor_para), - stream_(stream), - cublas_wrapper_(cublas_wrapper), - allocator_(allocator), - peer_allcator_(peer_alloctor), - is_free_buffer_after_forward_(is_free_buffer_after_forward), - cuda_device_prop_(cuda_device_prop), - debug_(isDebug()), - linear_{cublas_wrapper, stream}, - lora_params_(lora_params), - shared_state_(shared_state) +/// TODO: Padded vocab size should also be divisible by 8 +inline int pad_vocab_size(int vocab_size, int tp) +{ + return (vocab_size + tp - 1) / tp * tp; +} +template +LlamaV2::LlamaV2(const ModelParam& model, + const AttentionParam& attn, + const LoraParam& lora, + const NcclParam& tp, + const Context& ctx, + int max_batch_size, + std::shared_ptr> weights): + param_(model), + attn_param_(attn), + lora_param_(lora), + head_num_(model.head_num), + size_per_head_(model.head_dim), + inter_size_(model.inter_size), + hidden_units_(model.hidden_units), + layer_num_(model.layer_num), + vocab_size_(model.vocab_size), + vocab_size_padded_(pad_vocab_size(model.vocab_size, tp.world_size_)), + rmsnorm_eps_(model.norm_eps), + start_id_(model.start_id), + end_id_(model.end_id), + tensor_para_(tp), + local_head_num_(model.head_num / tp.world_size_), + local_kv_head_num_(model.kv_head_num / tp.world_size_), + weights_(std::move(weights)), + stream_(ctx.stream), + cublas_wrapper_(ctx.cublas_wrapper.get()), + allocator_(ctx.allocator.get()), + peer_allcator_(ctx.peer_allocator.get()), + linear_(ctx.linear.get()), + is_free_buffer_after_forward_(false), + debug_(isDebug()) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - TM_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_); - vocab_size_padded_ = - (vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_; + unified_decoder_ = std::make_unique>(model, attn, lora, tp, ctx); - batch_ = std::make_unique>(engine_params, cache_block_seq_len, quant_policy, this); + dynamic_decode_layer_ = std::make_unique>(vocab_size_, + vocab_size_padded_, + 0, // end_id, deprecated + stream_, + cublas_wrapper_, + allocator_, + is_free_buffer_after_forward_, + (cudaDeviceProp*)&ctx.cuda_device_prop); - initialize(attn_params, kv_head_num, use_context_fmha, cache_block_seq_len, quant_policy); - - unified_decoder_->allocateBuffer(engine_params.max_batch_size); - - /// TODO: decouple Llama model and batch inference - batch_->Start(); + unified_decoder_->allocateBuffer(max_batch_size); } template LlamaV2::~LlamaV2() { + dynamic_decode_layer_.reset(); unified_decoder_.reset(); - delete dynamic_decode_layer_; -} - -template -void LlamaV2::initialize(const LlamaAttentionParams& attn_params, - size_t kv_head_num, - bool use_context_fmha, - int cache_block_seq_len, - int quant_policy) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - unified_decoder_.reset(new UnifiedDecoder(head_num_, - kv_head_num, - size_per_head_, - hidden_units_, - inter_size_, - num_layer_, - attn_params, - rmsnorm_eps_, - tensor_para_, - stream_, - linear_, - allocator_, - lora_params_, - is_free_buffer_after_forward_, - 
use_context_fmha, - cache_block_seq_len, - quant_policy)); - - dynamic_decode_layer_ = new DynamicDecodeLayer(vocab_size_, - vocab_size_padded_, - 0, // end_id, deprecated - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - cuda_device_prop_); } template @@ -441,108 +391,6 @@ void LlamaV2::dynamicDecode(int* token_ids, dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); } -static inline Tensor slice(const Tensor& tensor, int index) -{ - auto shape = tensor.shape; - if (shape.at(0) == 1) { - return tensor; - } - shape[0] = 1; - const auto offset = std::accumulate(shape.begin(), shape.end(), (size_t)index, std::multiplies<>{}); - return tensor.slice(shape, offset); -} - -// ! implicit conversion from `unordered_map` to `TensorMap` drops 0-sized tensors -static inline TensorMap slice(const std::unordered_map& src, int index) -{ - TensorMap dst; - for (const auto& kv : src) { - dst.insert({kv.first, slice(kv.second, index)}); - } - return dst; -} - -template -void LlamaV2::forward(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control) -{ - if (debug_) { - if (tensor_para_.rank_ == 0) { - for (const auto& kv : *inputs) { - TM_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str()); - } - for (const auto& kv : *outputs) { - TM_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str()); - } - } - } - - const int batch_size = outputs->at("output_ids").shape[0]; - - const auto rank = tensor_para_.rank_; - FT_CHECK(rank == 0); - - std::vector> requests(batch_size); - - // allocates all requests for the batch - for (int i = 0; i < batch_size; ++i) { - requests[i] = std::make_shared(); - } - - for (int i = 0; i < batch_size; ++i) { - auto& r = requests[i]; - - r->inputs = slice(*inputs, i); - r->outputs = slice(*outputs, i); - - if (rank == 0) { - r->id = r->inputs.getVal("CORRID", i); - r->start_flag = r->inputs.getVal("START", 1); - r->end_flag = r->inputs.getVal("END", 1); - r->stop_flag = r->inputs.getVal("STOP", 0); - r->stream_cb = control.callback; - } - } - - // Submits the tasks and wait for finish - std::vector error_codes; - bool has_error = 0; - - TM_LOG_INFO("[forward] Enqueue requests"); - - std::vector ids; - for (const auto& r : requests) { - ids.push_back(r->id); - } - - auto futures = shared_state_->request_queue.enqueue(std::move(requests)); - - FT_CHECK_WITH_INFO(ids.size() == futures.size(), "check failed"); - - TM_LOG_INFO("[forward] Wait for requests to complete ..."); - - for (int i = 0; i < futures.size(); ++i) { - auto ec = futures[i].get(); - error_codes.push_back(ec); - if (ec) { - has_error = true; - TM_LOG_WARNING("[forward] Request failed for %ld, code %d", (long)ids[i], (int)ec); - } - else { - TM_LOG_INFO("[forward] Request completed for %ld", (long)ids[i]); - } - } - - if (has_error) { - std::stringstream ss; - for (int i = 0; i < error_codes.size(); ++i) { - ss << (i ? 
"" : " ") << error_codes[i]; - } - throw std::runtime_error(ss.str()); - } -} - template static std::string Join(First first, Last last, const std::string& delim) { @@ -564,12 +412,12 @@ void LlamaV2::tune() if (auto str = std::getenv("TM_GEMM_IMPORT")) { std::ifstream ifs(str); - const int n_imported = linear_.Import(ifs); + const int n_imported = linear_->Import(ifs); TM_LOG_INFO("[Gemm2] %d records imported", n_imported); return; } - std::vector bss = linear_.GetTuningSeq(); + std::vector bss = linear_->GetTuningSeq(); if (bss.empty()) { bss = gemm::GenerateTuningSequence(gemm::GetDefaultTuningGenerators()); } @@ -609,16 +457,16 @@ void LlamaV2::tune() T* out_data = (T*)allocator_->malloc(sizeof(T) * (size_t)max_bs * max_out); cudaRandomUniform(in_data, (size_t)max_bs * max_in); - cudaDeviceSynchronize(); + check_cuda_error(cudaDeviceSynchronize()); - linear_.set_measure(true); + linear_->set_measure(true); auto tick = std::chrono::steady_clock::now(); for (auto bs : bss) { TM_LOG_INFO("[Gemm2] %d", bs); for (auto& w : weights) { - linear_.forward(out_data, in_data, bs, *w); + linear_->forward(out_data, in_data, bs, *w); } } @@ -627,7 +475,9 @@ void LlamaV2::tune() TM_LOG_INFO("[Gemm2] Tuning finished in %.2f seconds.", std::chrono::duration>(tock - tick).count()); - linear_.set_measure(false); + linear_->set_measure(false); + + check_cuda_error(cudaDeviceSynchronize()); allocator_->free((void**)&in_data); allocator_->free((void**)&out_data); @@ -636,7 +486,7 @@ void LlamaV2::tune() if (tensor_para_.rank_ == 0) { if (auto path = std::getenv("TM_GEMM_EXPORT")) { std::ofstream ofs(path); - const auto n_records = linear_.Export(ofs); + const auto n_records = linear_->Export(ofs); TM_LOG_INFO("[Gemm2] %d records exported.", n_records); } } diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index b0a19f4239..f746c3f625 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -36,82 +36,29 @@ #include #include -using ffi_api_lock_ctrl_t = std::function; - namespace turbomind { template class LlamaV2 { public: - struct SharedState { - std::vector> infer_requests; - std::vector> stop_requests; - RequestQueue request_queue; - std::shared_ptr barrier; - bool abort; - std::atomic free_size{std::numeric_limits::max()}; - }; - ~LlamaV2(); - LlamaV2(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t num_layer, - size_t vocab_size, - float norm_eps, - const LlamaAttentionParams& attn_params, - int start_id, - int end_id, - int cache_block_seq_len, - int quant_policy, - bool use_context_fmha, - const EngineParams& engine_params, - const LoraParams& lora_params, - std::shared_ptr shared_state, - LlamaWeight* weights, - NcclParam tensor_para, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - IAllocator* peer_allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop); - - struct Control { - AbstractInstanceComm* comm; - Request::Callback callback; - }; - - void forward(std::unordered_map* outputs, - const std::unordered_map* inputs, - Control control); + LlamaV2(const ModelParam& model, + const AttentionParam& attn, + const LoraParam& lora, + const NcclParam& tp, + const Context& ctx, + int max_batch_size, + std::shared_ptr> weights); void tune(); - void stop(const std::vector& seq_ids); - size_t vocab_size() const noexcept { return vocab_size_; } - void setFfiLock(ffi_api_lock_ctrl_t func) - { - ffi_lock_ 
= func; - } - private: - friend class Batch; - - void initialize(const LlamaAttentionParams& attn_params, - size_t kv_head_num, - bool use_context_fmha, - int cache_block_seq_len, - int quant_policy); - void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step); void updateEmbedding(T* decoder_input, @@ -160,46 +107,38 @@ class LlamaV2 { private: friend class LlamaBatch; - const size_t head_num_; - const size_t size_per_head_; - const size_t inter_size_; - const size_t num_layer_; - const size_t vocab_size_; - size_t vocab_size_padded_; - float rmsnorm_eps_ = 1e-6f; - - const LlamaAttentionParams attn_params_; - - static constexpr bool neox_rotary_style_ = false; - - const int start_id_; - const int end_id_; - const size_t hidden_units_; - - const size_t local_head_num_; - const size_t local_kv_head_num_; - NcclParam tensor_para_; - - cudaStream_t stream_; - cublasMMWrapper* cublas_wrapper_; - IAllocator* allocator_; - IAllocator* peer_allcator_; - bool is_free_buffer_after_forward_; - cudaDeviceProp* cuda_device_prop_; - - const bool debug_{false}; - - LlamaWeight* weights_{}; - - LlamaLinear linear_; - - std::unique_ptr> unified_decoder_; - DynamicDecodeLayer* dynamic_decode_layer_{}; - - std::shared_ptr shared_state_; - ffi_api_lock_ctrl_t ffi_lock_; - std::unique_ptr> batch_; - LoraParams lora_params_; + const ModelParam param_; + const AttentionParam attn_param_; + const LoraParam lora_param_; + + const size_t head_num_; + const size_t size_per_head_; + const size_t hidden_units_; + const size_t inter_size_; + const size_t layer_num_; + const size_t vocab_size_; + const size_t vocab_size_padded_; + const float rmsnorm_eps_; + const int start_id_; + const int end_id_; + const NcclParam tensor_para_; + const size_t local_head_num_; + const size_t local_kv_head_num_; + + const std::shared_ptr> weights_{}; + + // Refs into `Context`, make the pointer constant (not the pointed objects) + cudaStream_t const stream_; + cublasMMWrapper* const cublas_wrapper_; + IAllocator* const allocator_; + IAllocator* const peer_allcator_; + LlamaLinear* const linear_; + + const bool is_free_buffer_after_forward_; + const bool debug_; + + std::unique_ptr> unified_decoder_; + std::unique_ptr> dynamic_decode_layer_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 18ecc2507d..1b1172f513 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -35,7 +35,7 @@ LlamaWeight::LlamaWeight(size_t head_num, bool attn_bias, WeightType weight_type, int group_size, - LoraParams lora_params, + LoraParam lora_param, size_t tensor_para_size, size_t tensor_para_rank): hidden_units_(hidden_units), @@ -61,7 +61,7 @@ LlamaWeight::LlamaWeight(size_t head_num, inter_size_, weight_type_, group_size, - lora_params, + lora_param, attn_bias, tensor_para_size_, tensor_para_rank_)); diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index f71e03715a..8c94925ce7 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -39,7 +39,7 @@ struct LlamaWeight { bool attn_bias, WeightType weight_type, int group_size, - LoraParams lora_params, + LoraParam lora_param, size_t tensor_para_size, size_t tensor_para_rank); diff --git a/src/turbomind/models/llama/context.h b/src/turbomind/models/llama/context.h new file mode 100644 index 0000000000..bbdab8c6bd --- /dev/null +++ 
b/src/turbomind/models/llama/context.h @@ -0,0 +1,25 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/utils/allocator.h" +#include "src/turbomind/utils/cublasMMWrapper.h" +#include +#include + +namespace turbomind { + +template +struct Context { + cudaStream_t stream; + std::unique_ptr> allocator; + std::unique_ptr> peer_allocator; + std::unique_ptr cublas_algo_map; + std::unique_ptr cublas_wrapper_mutex; + std::unique_ptr cublas_wrapper; + std::unique_ptr> linear; + cudaDeviceProp cuda_device_prop; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index 30d95ed53c..aa6076de6f 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -3,13 +3,29 @@ #pragma once #include "src/turbomind/models/llama/LlamaDenseWeight.h" +#include #include #include #include namespace turbomind { -struct LlamaAttentionParams { +struct ModelParam { + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t inter_size; + size_t vocab_size; + float norm_eps; + int quant_policy; + // + int start_id; + int end_id; +}; + +struct AttentionParam { int rotary_embedding_dim; float rotary_embedding_base; int max_position_embeddings; @@ -20,9 +36,10 @@ struct LlamaAttentionParams { float high_freq_factor; bool use_dynamic_ntk; bool use_logn_attn; + int cache_block_seq_len; }; -struct EngineParams { +struct EngineParam { // batch params int max_batch_size; int session_len; @@ -40,7 +57,7 @@ struct EngineParams { int max_prefill_iters; }; -struct LoraParams { +struct LoraParam { int r; float scale; LoraPolicy policy; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 05fff513be..cdd27a5c60 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -37,8 +37,41 @@ namespace turbomind { -template +template +UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, + const AttentionParam& attn, + const LoraParam& lora, + const NcclParam& tp, + const Context& ctx): + head_num_(model.head_num), + kv_head_num_(model.kv_head_num), + size_per_head_(model.head_dim), + hidden_units_(model.hidden_units), + local_head_num_(head_num_ / tp.world_size_), + local_kv_head_num_(model.kv_head_num / tp.world_size_), + param_(attn), + model_param_(model), + lora_param_(lora), + tensor_para_(tp), + context_(ctx), + stream_(ctx.stream), + linear_(ctx.linear.get()), + allocator_(ctx.allocator.get()), + arch_(getSMVersion()) +{ + FT_CHECK(head_num_ % kv_head_num_ == 0); + + check_cuda_error(cudaStreamCreateWithFlags(&aux_stream_, cudaStreamNonBlocking)); + check_cuda_error(cudaEventCreateWithFlags(&qkv_event_, cudaEventDisableTiming)); + check_cuda_error(cudaEventCreateWithFlags(&aux_event_, cudaEventDisableTiming)); + + streams_[0] = stream_; + streams_[1] = aux_stream_; + allocateWorkspace(); +} + +template void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, @@ -168,7 +201,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa weights); // [L, 2, H, s, D] - const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_; + const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * 
param_.cache_block_seq_len * size_per_head_; static int count = 0; @@ -180,7 +213,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa ////////////////////////////////////////////// /// qkv gemm // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_.forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); + linear_->forward(qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); @@ -241,7 +274,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.block_iter_params = BlockIteratorParams{(char**)block_ptrs, // (int*)cu_block_count + offset, layer_id, - (int)kv_cache_block_len_}; + (int)param_.cache_block_seq_len}; // Prefilling use only const int sum_k_len = h_cu_k_len[offset + pf_batch_size] - h_cu_k_len[offset]; @@ -260,22 +293,22 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // MSVC does not have M_LOG2E params.inv_sqrt_dh = (float)std::log2(expf(1.)) / std::sqrt((float)params.size_per_head); - params.rotary_embedding_dim = params_.rotary_embedding_dim; - params.rotary_embedding_base = params_.rotary_embedding_base; - params.max_position_embeddings = params_.max_position_embeddings; + params.rotary_embedding_dim = param_.rotary_embedding_dim; + params.rotary_embedding_base = param_.rotary_embedding_base; + params.max_position_embeddings = param_.max_position_embeddings; params.rope_ti_scale = 1.f; - if (params_.rope_scaling_type == "linear") { - params.rope_ti_scale /= params_.rope_scaling_factor; + if (param_.rope_scaling_type == "linear") { + params.rope_ti_scale /= param_.rope_scaling_factor; } - if (params_.rope_scaling_type == "llama3") { + if (param_.rope_scaling_type == "llama3") { const double PI = 3.14159265358979323846; - float inv_diff_freq_factor = 1.0 / (params_.high_freq_factor - params_.low_freq_factor); - params.llama3_inv_scaling_factor = 1.0 / params_.rope_scaling_factor; - params.llama3_alpha = params_.original_max_position_embeddings / (2 * PI) * inv_diff_freq_factor; - params.llama3_beta = params_.low_freq_factor * inv_diff_freq_factor; + float inv_diff_freq_factor = 1.0 / (param_.high_freq_factor - param_.low_freq_factor); + params.llama3_inv_scaling_factor = 1.0 / param_.rope_scaling_factor; + params.llama3_alpha = param_.original_max_position_embeddings / (2 * PI) * inv_diff_freq_factor; + params.llama3_beta = param_.low_freq_factor * inv_diff_freq_factor; } - params.use_logn_attn = params_.use_logn_attn; + params.use_logn_attn = param_.use_logn_attn; // Decoding use only for now FT_CHECK(barriers_); @@ -289,7 +322,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.arch = arch_; params.stream = stream; - params.quant_policy = quant_policy_; + params.quant_policy = model_param_.quant_policy; return params; }; @@ -344,7 +377,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa ////////////////////////////////////////////// /// output gemm -> - linear_.forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); + linear_->forward(attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_mask); sync_check_cuda_error(); // ++count; diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h 
index 58bba45896..19aa08c29f 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -23,10 +23,12 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/nccl_utils.h" +#include namespace turbomind { @@ -48,50 +50,25 @@ class UnifiedAttentionLayer { { freeBuffer(); freeWorkspace(); - } - - UnifiedAttentionLayer(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - LlamaAttentionParams attn_params, - NcclParam tensor_para, - LoraParams lora_params, - cudaStream_t stream, - LlamaLinear linear, - IAllocator* allocator, - bool is_free_buffer_after_forward, - int cache_block_seq_len, - int quant_policy): - head_num_(head_num), - size_per_head_(size_per_head), - hidden_units_(hidden_units), - local_head_num_(head_num / tensor_para.world_size_), - local_kv_head_num_(kv_head_num / tensor_para.world_size_), - head_n_rep_(head_num / kv_head_num), - params_(attn_params), - tensor_para_(tensor_para), - lora_params_(lora_params), - stream_(stream), - linear_(linear), - allocator_(allocator), - kv_cache_block_len_(cache_block_seq_len), - is_free_buffer_after_forward_(is_free_buffer_after_forward), - quant_policy_(quant_policy) - { - FT_CHECK(head_num % kv_head_num == 0); - arch_ = getSMVersion(); - check_cuda_error(cudaStreamCreateWithFlags(&aux_stream_, cudaStreamNonBlocking)); - check_cuda_error(cudaEventCreateWithFlags(&qkv_event_, cudaEventDisableTiming)); - check_cuda_error(cudaEventCreateWithFlags(&aux_event_, cudaEventDisableTiming)); + for (auto& s : streams_) { + s = {}; + } - streams_[0] = stream_; - streams_[1] = aux_stream_; + check_cuda_error(cudaEventDestroy(aux_event_)); + check_cuda_error(cudaEventDestroy(qkv_event_)); + check_cuda_error(cudaStreamDestroy(aux_stream_)); - allocateWorkspace(); + aux_event_ = qkv_event_ = {}; + aux_stream_ = {}; } + UnifiedAttentionLayer(const ModelParam& model, + const AttentionParam& attn, + const LoraParam& lora, + const NcclParam& tp, + const Context& context); + void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight* weights); void prefill(T* output, @@ -131,25 +108,24 @@ class UnifiedAttentionLayer { private: const size_t head_num_; + const size_t kv_head_num_; const size_t size_per_head_; const size_t hidden_units_; - const size_t local_kv_head_num_; const size_t local_head_num_; - const size_t head_n_rep_; - const size_t kv_cache_block_len_; - const bool is_free_buffer_after_forward_; - - const LlamaAttentionParams params_; - - const int quant_policy_; + const size_t local_kv_head_num_; - NcclParam tensor_para_; + const AttentionParam param_; + const ModelParam model_param_; + const LoraParam lora_param_; + const NcclParam tensor_para_; + const Context& context_; - LoraParams lora_params_; + cudaStream_t const stream_; + LlamaLinear* const linear_; + IAllocator* const allocator_; + const int arch_{}; - cudaStream_t stream_; - IAllocator* allocator_; - LlamaLinear linear_; + const bool is_free_buffer_after_forward_{false}; cudaStream_t aux_stream_; cudaEvent_t qkv_event_; @@ -157,8 +133,6 @@ class UnifiedAttentionLayer { std::array streams_; - int arch_{}; - T* qkv_buf_{}; T* q_buf_2_{}; T* k_buf_2_{}; diff --git 
a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index db9482fb48..e1fa9efbde 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -10,13 +10,39 @@ namespace turbomind { +template +UnifiedDecoder::UnifiedDecoder(const ModelParam& model, + const AttentionParam& attn, + const LoraParam& lora, + const NcclParam& tp, + const Context& ctx): + layer_num_(model.layer_num), + hidden_units_(model.hidden_units), + rmsnorm_eps_(model.norm_eps), + stream_(ctx.stream), + allocator_(ctx.allocator.get()), + dtype_(getTensorType()) +{ + + attn_layer_ = std::make_unique>(model, attn, lora, tp, ctx); + ffn_layer_ = std::make_unique>(model, tp, ctx); + + check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); +} + +template +UnifiedDecoder::~UnifiedDecoder() +{ + freeBuffer(); + check_cuda_error(cudaEventDestroy(ev_h_cu_x_)); +} + template void UnifiedDecoder::allocateBuffer(size_t batch_size) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - cu_q_len_ = (int*)allocator_->reMalloc(cu_q_len_, 2 * sizeof(int) * (batch_size + 1), false); - + cu_q_len_ = (int*)allocator_->reMalloc(cu_q_len_, 2 * sizeof(int) * (batch_size + 1), false); h_cu_q_len_ = (int*)allocator_->reMalloc(h_cu_q_len_, 2 * sizeof(int) * (batch_size + 1), false, true); } @@ -26,43 +52,9 @@ void UnifiedDecoder::freeBuffer() TM_LOG_DEBUG(__PRETTY_FUNCTION__); allocator_->free((void**)&cu_q_len_); - allocator_->free((void**)&h_cu_q_len_, true); } -template -void UnifiedDecoder::initialize(const LlamaAttentionParams& attn_params, - size_t kv_head_num, - int cache_block_seq_len, - int quant_policy) -{ - attn_layer_ = new UnifiedAttentionLayer(head_num_, - kv_head_num, - size_per_head_, - hidden_units_, - attn_params, - tensor_para_, - lora_params_, - stream_, - linear_, - allocator_, - is_free_buffer_after_forward_, - cache_block_seq_len, - quant_policy); - - ffn_layer_ = new LlamaFfnLayer(head_num_, - size_per_head_, - hidden_units_, - inter_size_, - tensor_para_, - stream_, - linear_, - allocator_, - is_free_buffer_after_forward_); - - check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); -} - template void UnifiedDecoder::forwardSelfAttn(T* attn_io, TensorMap* _outputs, @@ -86,15 +78,6 @@ void UnifiedDecoder::forwardSelfAttn(T* attn_io, attn_layer_->forward(&outputs, &inputs, weight); } -template -UnifiedDecoder::~UnifiedDecoder() -{ - check_cuda_error(cudaEventDestroy(ev_h_cu_x_)); - delete attn_layer_; - delete ffn_layer_; - freeBuffer(); -} - template void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, const std::vector* weights) { @@ -166,7 +149,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_output, token_num * hidden_units_, Concat("norm0", 0), 2); - for (size_t layer = 0; layer < num_layer_; ++layer) { + for (size_t layer = 0; layer < layer_num_; ++layer) { // Compare(decoder_output, token_num * hidden_units_, "attn_input", kCmpRead, stream_); @@ -209,7 +192,7 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con count_and_fix(decoder_output, token_num * hidden_units_, Concat("ffn_block", layer), 2); - const bool is_last_layer = layer == num_layer_ - 1; + const bool is_last_layer = layer == layer_num_ - 1; auto scale_weight = !is_last_layer ? 
weights->at(layer + 1)->self_attn_norm_weights : inputs->at("output_norm_weight").getPtr(); diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index b2acbe1b44..95a36d2a8b 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -2,6 +2,7 @@ #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" #include "src/turbomind/models/llama/LlamaFfnLayer.h" +#include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/utils/cublasMMWrapper.h" @@ -12,27 +13,16 @@ namespace turbomind { template class UnifiedDecoder { -protected: +private: void freeBuffer(); - void - initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int cache_block_seq_len, int quant_policy); - - cudaStream_t stream_; - LlamaLinear linear_; - IAllocator* allocator_; - bool is_free_buffer_after_forward_{}; - - size_t head_num_; - size_t size_per_head_; - size_t inter_size_; - size_t num_layer_; - size_t hidden_units_; - float rmsnorm_eps_; - - NcclParam tensor_para_; - - LoraParams lora_params_; + const size_t layer_num_; + const size_t hidden_units_; + const float rmsnorm_eps_; + cudaStream_t const stream_; + IAllocator* const allocator_; + const DataType dtype_; + bool is_free_buffer_after_forward_{}; int* cu_q_len_{}; int* cu_k_len_{}; @@ -40,15 +30,11 @@ class UnifiedDecoder { int* h_cu_q_len_{}; int* h_cu_k_len_{}; - UnifiedAttentionLayer* attn_layer_{}; - LlamaFfnLayer* ffn_layer_{}; + std::unique_ptr> attn_layer_; + std::unique_ptr> ffn_layer_; cudaEvent_t ev_h_cu_x_{}; - const DataType dtype_; - - // bool need_causal_mask_{false}; - using WeightType = LlamaDecoderLayerWeight; void forwardSelfAttn(T* attn_io, @@ -60,39 +46,11 @@ class UnifiedDecoder { const LlamaAttentionWeight* weight); public: - UnifiedDecoder(size_t head_num, - size_t kv_head_num, - size_t size_per_head, - size_t hidden_units, - size_t inter_size, - size_t num_layer, - const LlamaAttentionParams& attn_params, - float rmsnorm_eps, - NcclParam tensor_para, - cudaStream_t stream, - LlamaLinear linear, - IAllocator* allocator, - const LoraParams& lora_params, - bool is_free_buffer_after_forward, - bool use_fmha, - int cache_block_seq_len, - int quant_policy): - stream_(stream), - linear_(linear), - allocator_(allocator), - lora_params_(lora_params), - is_free_buffer_after_forward_(is_free_buffer_after_forward), - head_num_(head_num), - size_per_head_(size_per_head), - inter_size_(inter_size), - hidden_units_(hidden_units), - num_layer_(num_layer), - rmsnorm_eps_(rmsnorm_eps), - tensor_para_(tensor_para), - dtype_(getTensorType()) - { - initialize(attn_params, kv_head_num, cache_block_seq_len, quant_policy); - } + UnifiedDecoder(const ModelParam& model, + const AttentionParam& attn, + const LoraParam& lora, + const NcclParam& tp, + const Context& ctx); void allocateBuffer(size_t max_batch_size); diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 973c023725..a716f8a49f 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -376,14 +376,14 @@ PYBIND11_MODULE(_turbomind, m) if (data_type == "half" || data_type == "fp16" || data_type == "int4") { auto model = std::make_shared>( tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->setFfiLock(gil_control); + model->set_ffi_lock(gil_control); return model; } else 
if (data_type == "bf16") { #ifdef ENABLE_BF16 auto model = std::make_shared>( tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->setFfiLock(gil_control); + model->set_ffi_lock(gil_control); return model; #else throw std::runtime_error("Error: turbomind has not been built with bf16 support."); @@ -393,7 +393,7 @@ PYBIND11_MODULE(_turbomind, m) #ifdef ENABLE_FP32 auto model = std::make_shared>( tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config); - model->setFfiLock(gil_control); + model->set_ffi_lock(gil_control); return model; #else throw std::runtime_error("Error: turbomind has not been built with fp32 support."); diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index cb9ea29f48..58fec72e88 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -22,10 +22,13 @@ #include "3rdparty/INIReader.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaInstanceComm.h" +#include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/models/llama/context.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" +#include #include namespace ft = turbomind; @@ -94,70 +97,84 @@ std::map> getLoraPattern(std::string patte template void LlamaTritonModel::handleMissingParams() { - if (kv_head_num_ == 0) { - kv_head_num_ = head_num_; - TM_LOG_WARNING("[LlamaTritonModel] `kv_head_num` is not set, default to `head_num` (%d).", (int)kv_head_num_); + if (model_param_.kv_head_num == 0) { + model_param_.kv_head_num = model_param_.head_num; + TM_LOG_WARNING("[LlamaTritonModel] `kv_head_num` is not set, default to `head_num` (%d).", + (int)model_param_.kv_head_num); } - if (!attn_params_.max_position_embeddings) { - attn_params_.max_position_embeddings = 2048; + if (!attn_param_.max_position_embeddings) { + attn_param_.max_position_embeddings = 2048; TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to %d.", - (int)attn_params_.max_position_embeddings); + (int)attn_param_.max_position_embeddings); } - if (!engine_params_.max_batch_size) { - engine_params_.max_batch_size = 64; + if (!engine_param_.max_batch_size) { + engine_param_.max_batch_size = 64; TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", - (int)engine_params_.max_batch_size); + (int)engine_param_.max_batch_size); } - if (!engine_params_.session_len) { - engine_params_.session_len = attn_params_.max_position_embeddings; - TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)engine_params_.session_len); + if (!engine_param_.session_len) { + engine_param_.session_len = attn_param_.max_position_embeddings; + TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)engine_param_.session_len); } - if (!engine_params_.max_prefill_token_num) { - engine_params_.max_prefill_token_num = 8192; + if (!engine_param_.max_prefill_token_num) { + engine_param_.max_prefill_token_num = 8192; TM_LOG_WARNING("[LlamaTritonModel] `max_prefill_token_num` is not set, default to %d.", - (int)engine_params_.max_prefill_token_num); + (int)engine_param_.max_prefill_token_num); } - if 
(!engine_params_.max_context_token_num) { - engine_params_.max_context_token_num = engine_params_.session_len; + if (!engine_param_.max_context_token_num) { + engine_param_.max_context_token_num = engine_param_.session_len; TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.", - (int)engine_params_.max_context_token_num); + (int)engine_param_.max_context_token_num); } - if (engine_params_.max_context_token_num <= engine_params_.max_batch_size) { - engine_params_.max_context_token_num *= engine_params_.session_len; - TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` = %d.", (int)engine_params_.max_context_token_num); + if (engine_param_.max_context_token_num <= engine_param_.max_batch_size) { + engine_param_.max_context_token_num *= engine_param_.session_len; + TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` = %d.", (int)engine_param_.max_context_token_num); } - if (!engine_params_.step_length) { - engine_params_.step_length = 1; + if (!engine_param_.step_length) { + engine_param_.step_length = 1; } - if (!engine_params_.cache_max_block_count) { - engine_params_.cache_max_block_count = .95f; + if (!engine_param_.cache_max_block_count) { + engine_param_.cache_max_block_count = .95f; TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %f.", - engine_params_.cache_max_block_count); + engine_param_.cache_max_block_count); } - if (!cache_block_seq_len_) { - cache_block_seq_len_ = 128; - TM_LOG_WARNING("[LlamaTritonModel] `cache_block_seq_len` is not set, default to %d.", cache_block_seq_len_); + if (!attn_param_.cache_block_seq_len) { + attn_param_.cache_block_seq_len = 128; + TM_LOG_WARNING("[LlamaTritonModel] `cache_block_seq_len` is not set, default to %d.", + attn_param_.cache_block_seq_len); } - if (!engine_params_.cache_chunk_size) { - engine_params_.cache_chunk_size = engine_params_.cache_max_block_count; + if (!engine_param_.cache_chunk_size) { + engine_param_.cache_chunk_size = engine_param_.cache_max_block_count; TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", - (int)engine_params_.cache_chunk_size); + (int)engine_param_.cache_chunk_size); } - if (!engine_params_.num_tokens_per_iter) { - engine_params_.num_tokens_per_iter = engine_params_.max_context_token_num; + if (!engine_param_.num_tokens_per_iter) { + engine_param_.num_tokens_per_iter = engine_param_.max_context_token_num; TM_LOG_WARNING("[LlamaTritonModel] `num_tokens_per_iter` is not set, default to `max_context_token_num` (%d).", - (int)engine_params_.num_tokens_per_iter); + (int)engine_param_.num_tokens_per_iter); + } +} + +template +LlamaTritonModel::~LlamaTritonModel() +{ + ft::FT_CHECK(weights_.size() == engines_.size()); + for (int device_id = 0; device_id < (int)engines_.size(); ++device_id) { + // Set device id before destructing CUDA resources + ft::check_cuda_error(cudaSetDevice(device_id)); + engines_[device_id].reset(); + weights_[device_id].reset(); } } @@ -169,7 +186,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, std::string config): tensor_para_size_(tensor_para_size), pipeline_para_size_(pipeline_para_size), - shared_weights_(std::vector>>(ft::getDeviceCount())), + weights_(ft::getDeviceCount()), enable_custom_all_reduce_(enable_custom_all_reduce) { INIReader reader; @@ -196,66 +213,65 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, } } - model_name_ = reader.Get("llama", "model_name"); - head_num_ = reader.GetInteger("llama", "head_num"); - kv_head_num_ = 
reader.GetInteger("llama", "kv_head_num", 0); - hidden_units_ = reader.GetInteger("llama", "hidden_units"); - size_per_head_ = reader.GetInteger("llama", "size_per_head"); - inter_size_ = reader.GetInteger("llama", "inter_size"); - num_layer_ = reader.GetInteger("llama", "num_layer"); - vocab_size_ = reader.GetInteger("llama", "vocab_size"); - norm_eps_ = reader.GetFloat("llama", "norm_eps"); - start_id_ = reader.GetInteger("llama", "start_id"); - end_id_ = reader.GetInteger("llama", "end_id"); - use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1); - cache_block_seq_len_ = reader.GetInteger("llama", "cache_block_seq_len", 0); - - attn_bias_ = reader.GetInteger("llama", "attn_bias", 0); - quant_policy_ = reader.GetInteger("llama", "quant_policy", 0); - group_size_ = reader.GetInteger("llama", "group_size", 0); + model_name_ = reader.Get("llama", "model_name"); + model_param_.head_num = reader.GetInteger("llama", "head_num"); + model_param_.head_dim = reader.GetInteger("llama", "size_per_head"); + model_param_.kv_head_num = reader.GetInteger("llama", "kv_head_num", 0); + model_param_.hidden_units = reader.GetInteger("llama", "hidden_units"); + model_param_.layer_num = reader.GetInteger("llama", "num_layer"); + model_param_.inter_size = reader.GetInteger("llama", "inter_size"); + model_param_.vocab_size = reader.GetInteger("llama", "vocab_size"); + model_param_.norm_eps = reader.GetFloat("llama", "norm_eps"); + model_param_.start_id = reader.GetInteger("llama", "start_id"); + model_param_.end_id = reader.GetInteger("llama", "end_id"); + attn_param_.cache_block_seq_len = reader.GetInteger("llama", "cache_block_seq_len", 0); + model_param_.quant_policy = reader.GetInteger("llama", "quant_policy", 0); + + // Only weight classes need these + attn_bias_ = reader.GetInteger("llama", "attn_bias", 0); + group_size_ = reader.GetInteger("llama", "group_size", 0); // rotary embedding parameters - attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding"); - attn_params_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f); - attn_params_.rope_scaling_type = reader.Get("llama", "rope_scaling_type", ""); - attn_params_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f); - attn_params_.low_freq_factor = reader.GetFloat("llama", "low_freq_factor", 1.0); - attn_params_.high_freq_factor = reader.GetFloat("llama", "high_freq_factor", 1.0); - attn_params_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0); - attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0); - attn_params_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0); - - attn_params_.original_max_position_embeddings = reader.GetInteger("llama", "original_max_position_embeddings", 0); - - engine_params_.max_batch_size = reader.GetInteger("llama", "max_batch_size", 0); - engine_params_.max_prefill_token_num = reader.GetInteger("llama", "max_prefill_token_num", 0); - engine_params_.max_context_token_num = reader.GetInteger("llama", "max_context_token_num", 0); - engine_params_.session_len = reader.GetInteger("llama", "session_len", 0); - engine_params_.step_length = reader.GetInteger("llama", "step_length", 0); - - engine_params_.cache_max_block_count = reader.GetFloat("llama", "cache_max_entry_count", 0); - engine_params_.cache_chunk_size = reader.GetInteger("llama", "cache_chunk_size", 0); - engine_params_.enable_prefix_caching = reader.GetBoolean("llama", "enable_prefix_caching", false); - - 
engine_params_.num_tokens_per_iter = reader.GetInteger("llama", "num_tokens_per_iter", 0); - engine_params_.max_prefill_iters = reader.GetInteger("llama", "max_prefill_iters", 1); - - lora_params_.policy = ft::getLoraPolicy(reader.Get("llama", "lora_policy", "")); - lora_params_.r = reader.GetInteger("llama", "lora_r", 0); - lora_params_.scale = reader.GetFloat("llama", "lora_scale", 0); - lora_params_.max_wo_r = reader.GetInteger("llama", "lora_max_wo_r", 0); - lora_params_.rank_pattern = getLoraPattern(reader.Get("llama", "lora_rank_pattern", ""), - [](const std::string& s) { return std::stoi(s); }); - lora_params_.scale_pattern = getLoraPattern(reader.Get("llama", "lora_scale_pattern", ""), - [](const std::string& s) { return std::stof(s); }); + attn_param_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding"); + attn_param_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f); + attn_param_.rope_scaling_type = reader.Get("llama", "rope_scaling_type", ""); + attn_param_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f); + attn_param_.low_freq_factor = reader.GetFloat("llama", "low_freq_factor", 1.0); + attn_param_.high_freq_factor = reader.GetFloat("llama", "high_freq_factor", 1.0); + attn_param_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0); + attn_param_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0); + attn_param_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0); + + attn_param_.original_max_position_embeddings = reader.GetInteger("llama", "original_max_position_embeddings", 0); + + engine_param_.max_batch_size = reader.GetInteger("llama", "max_batch_size", 0); + engine_param_.max_prefill_token_num = reader.GetInteger("llama", "max_prefill_token_num", 0); + engine_param_.max_context_token_num = reader.GetInteger("llama", "max_context_token_num", 0); + engine_param_.session_len = reader.GetInteger("llama", "session_len", 0); + engine_param_.step_length = reader.GetInteger("llama", "step_length", 0); + + engine_param_.cache_max_block_count = reader.GetFloat("llama", "cache_max_entry_count", 0); + engine_param_.cache_chunk_size = reader.GetInteger("llama", "cache_chunk_size", 0); + engine_param_.enable_prefix_caching = reader.GetBoolean("llama", "enable_prefix_caching", false); + + engine_param_.num_tokens_per_iter = reader.GetInteger("llama", "num_tokens_per_iter", 0); + engine_param_.max_prefill_iters = reader.GetInteger("llama", "max_prefill_iters", 1); + + lora_param_.policy = ft::getLoraPolicy(reader.Get("llama", "lora_policy", "")); + lora_param_.r = reader.GetInteger("llama", "lora_r", 0); + lora_param_.scale = reader.GetFloat("llama", "lora_scale", 0); + lora_param_.max_wo_r = reader.GetInteger("llama", "lora_max_wo_r", 0); + lora_param_.rank_pattern = getLoraPattern(reader.Get("llama", "lora_rank_pattern", ""), + [](const std::string& s) { return std::stoi(s); }); + lora_param_.scale_pattern = getLoraPattern(reader.Get("llama", "lora_scale_pattern", ""), + [](const std::string& s) { return std::stof(s); }); handleMissingParams(); - shared_state_ = std::make_shared::SharedState>(); + shared_state_ = std::make_shared(); shared_state_->barrier = std::make_shared(tensor_para_size); const auto device_count = ft::getDeviceCount(); - shared_instances_.resize(device_count); - shared_mutexes_.resize(device_count); + engines_.resize(device_count); const std::string weight_type_str = reader.Get("llama", "weight_type"); if (weight_type_str == "fp16") { @@ 
-282,7 +298,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, } template -std::unique_ptr> LlamaTritonModel::createSharedModelInstance( +std::unique_ptr> LlamaTritonModel::createSharedModelInstance( int device_id, int rank, std::pair, std::vector> nccl_params, @@ -291,42 +307,46 @@ std::unique_ptr> LlamaTritonModel::createSh ft::check_cuda_error(cudaSetDevice(device_id)); const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_); - /// TODO: this stream handle is leaked - cudaStream_t stream{}; - ft::check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + auto ctx = std::make_unique>(); - auto allocator = std::make_unique>(device_id, false); - allocator->setStream(stream); + ft::check_cuda_error(cudaStreamCreateWithFlags(&ctx->stream, cudaStreamNonBlocking)); + + ctx->allocator = std::make_unique>(device_id, false); + ctx->allocator->setStream(ctx->stream); - auto peer_allocator = std::make_unique>(device_id, true); - peer_allocator->setStream(stream); + ctx->peer_allocator = std::make_unique>(device_id, true); + ctx->peer_allocator->setStream(ctx->stream); cublasHandle_t cublas_handle; cublasLtHandle_t cublaslt_handle; cublasCreate(&cublas_handle); cublasLtCreate(&cublaslt_handle); - cublasSetStream(cublas_handle, stream); + cublasSetStream(cublas_handle, ctx->stream); - std::unique_ptr cublas_algo_map(new ft::cublasAlgoMap("gemm_config.in")); - std::unique_ptr cublas_wrapper_mutex(new std::mutex()); - std::unique_ptr cublas_wrapper(new ft::cublasMMWrapper( - cublas_handle, cublaslt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get())); + ctx->cublas_algo_map = std::make_unique("gemm_config.in"); + ctx->cublas_wrapper_mutex = std::make_unique(); + ctx->cublas_wrapper = std::make_unique(cublas_handle, + cublaslt_handle, + ctx->stream, + ctx->cublas_algo_map.get(), + ctx->cublas_wrapper_mutex.get(), + ctx->allocator.get()); + ctx->linear = std::make_unique>(ctx->cublas_wrapper.get(), ctx->stream); - std::unique_ptr cuda_device_prop_ptr(new cudaDeviceProp); - ft::check_cuda_error(cudaGetDeviceProperties(cuda_device_prop_ptr.get(), device_id)); + ft::check_cuda_error(cudaGetDeviceProperties(&ctx->cuda_device_prop, device_id)); if (std::is_same::value) { - cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); + ctx->cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); } #ifdef ENABLE_FP32 else if (std::is_same::value) { - cublas_wrapper->setFP32GemmConfig(); + ctx.cublas_wrapper->setFP32GemmConfig(); } #endif #ifdef ENABLE_BF16 else if (std::is_same::value) { - cublas_wrapper->setBF16GemmConfig(); + ctx->cublas_wrapper->setBF16GemmConfig(); } #endif @@ -336,42 +356,23 @@ std::unique_ptr> LlamaTritonModel::createSh ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_); ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_); - auto llama = std::make_unique>(head_num_, - kv_head_num_, - size_per_head_, - hidden_units_, - inter_size_, - num_layer_, - vocab_size_, - norm_eps_, - attn_params_, - start_id_, - end_id_, - cache_block_seq_len_, - quant_policy_, - use_context_fmha_, - engine_params_, - lora_params_, - shared_state_, - shared_weights_[device_id].get(), + auto model = std::make_unique>(model_param_, // + attn_param_, + lora_param_, tensor_para, - stream, - cublas_wrapper.get(), - allocator.get(), - peer_allocator.get(), - false, // is_free_buffer_after_forward, - cuda_device_prop_ptr.get()); - - return std::make_unique>( - 
LlamaTritonSharedModelInstance{std::move(allocator), - std::move(peer_allocator), - std::move(cublas_algo_map), - std::move(cublas_wrapper_mutex), - std::move(cublas_wrapper), - std::move(cuda_device_prop_ptr), - shared_weights_[device_id], - std::move(llama), - engine_params_.session_len}); + *ctx, + engine_param_.max_batch_size, + weights_[device_id]); + + auto engine = std::make_unique>(engine_param_, // + std::move(model), + std::move(ctx), + shared_state_, + device_id); + + engine->Start(); + + return engine; } template @@ -384,13 +385,13 @@ LlamaTritonModel::createModelInstance(int device_id, { ft::check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK((bool)shared_instances_[device_id]); + ft::FT_CHECK(engines_[device_id] != nullptr); auto allocator = std::make_unique>(device_id, false); allocator->setStream(stream); - return std::make_unique>(shared_instances_[device_id], std::move(allocator), device_id); + return std::make_unique>(*engines_[device_id], std::move(allocator), device_id); } template @@ -400,22 +401,22 @@ void LlamaTritonModel::createSharedWeights(int device_id, int rank) const int tensor_para_rank = rank % tensor_para_size_; const int pipeline_para_rank = rank / tensor_para_size_; ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0); - shared_weights_[device_id] = std::make_shared>(head_num_, - kv_head_num_, - size_per_head_, - hidden_units_, - inter_size_, - vocab_size_, - num_layer_, - attn_bias_, - weight_type_, - group_size_, - lora_params_, - tensor_para_size_, - tensor_para_rank); + weights_[device_id] = std::make_shared>(model_param_.head_num, + model_param_.kv_head_num, + model_param_.head_dim, + model_param_.hidden_units, + model_param_.inter_size, + model_param_.vocab_size, + model_param_.layer_num, + attn_bias_, + weight_type_, + group_size_, + lora_param_, + tensor_para_size_, + tensor_para_rank); // model inited with model_dir if (model_dir_ != "") { - shared_weights_[device_id]->loadModel(model_dir_); + weights_[device_id]->loadModel(model_dir_); } return; } @@ -425,8 +426,8 @@ TensorMap LlamaTritonModel::getParams(int deviceId, int rank) { ft::check_cuda_error(cudaSetDevice(deviceId)); // shared_weight should be created before getParams - ft::FT_CHECK(shared_weights_[deviceId] != nullptr); - ft::TensorMap output = shared_weights_[deviceId]->getParams(); + ft::FT_CHECK(weights_[deviceId] != nullptr); + ft::TensorMap output = weights_[deviceId]->getParams(); TensorMap result; for (auto [name, tensor] : output) { result.emplace(name, triton::Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}); @@ -438,12 +439,12 @@ template void LlamaTritonModel::processWeights(int device_id, int rank) { ft::check_cuda_error(cudaSetDevice(device_id)); - ft::FT_CHECK(shared_weights_[device_id] != nullptr); + ft::FT_CHECK(weights_[device_id] != nullptr); cudaDeviceProp props{}; ft::check_cuda_error(cudaGetDeviceProperties(&props, device_id)); - shared_weights_[device_id]->prepare(props); + weights_[device_id]->prepare(props); ft::sync_check_cuda_error(); } @@ -454,35 +455,37 @@ void LlamaTritonModel::createEngine(int std::shared_ptr custom_all_reduce_comm) { - auto instance = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); - instance->llm->setFfiLock(ffi_lock_); + auto engine = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); + engine->set_ffi_lock(ffi_lock_); if (weight_type_ == ft::WeightType::kINT4) { - instance->llm->tune(); + engine->model().tune(); } - 
shared_instances_[device_id] = std::move(instance); + engines_[device_id] = std::move(engine); } template std::string LlamaTritonModel::toString() { std::stringstream ss; - ss << "Model: " - << "\nhead_num: " << head_num_ << "\nkv_head_num: " << kv_head_num_ << "\nsize_per_head: " << size_per_head_ - << "\ninter_size: " << inter_size_ << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ - << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_params_.max_batch_size - << "\nmax_prefill_token_num: " << engine_params_.max_prefill_token_num - << "\nmax_context_token_num: " << engine_params_.max_context_token_num - << "\nsession_len: " << engine_params_.session_len << "\nstep_length: " << engine_params_.step_length - << "\ncache_max_entry_count: " << engine_params_.cache_max_block_count - << "\ncache_block_seq_len: " << cache_block_seq_len_ << "\ncache_chunk_size: " << engine_params_.cache_chunk_size - << "\nenable_prefix_caching: " << engine_params_.enable_prefix_caching - << "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_ + ss << "Model: " // + << "\nhead_num: " << model_param_.head_num << "\nkv_head_num: " << model_param_.kv_head_num + << "\nsize_per_head: " << model_param_.head_dim << "\ninter_size: " << model_param_.inter_size + << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size + << "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_param_.max_batch_size + << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num + << "\nmax_context_token_num: " << engine_param_.max_context_token_num + << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter + << "\nmax_prefill_iters: " << engine_param_.max_prefill_iters << "\nsession_len: " << engine_param_.session_len + << "\ncache_max_entry_count: " << engine_param_.cache_max_block_count + << "\ncache_block_seq_len: " << attn_param_.cache_block_seq_len + << "\ncache_chunk_size: " << engine_param_.cache_chunk_size + << "\nenable_prefix_caching: " << engine_param_.enable_prefix_caching << "\nstart_id: " << model_param_.start_id << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_ - << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << quant_policy_ << "\ngroup_size: " << group_size_ - << std::endl; + << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << model_param_.quant_policy + << "\ngroup_size: " << group_size_ << std::endl; return ss.str(); } @@ -495,36 +498,6 @@ void LlamaTritonModel::createCustomComms( ft::initCustomAllReduceComm(custom_all_reduce_comms, enable_custom_all_reduce_, world_size); } -template -std::pair, std::vector> -LlamaTritonModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node) -{ - const auto device_count = ft::getDeviceCount(); - bool need_nccl_params = false; - // create nccl group when there are non-occupied devices - for (int i = 0; i < device_count; ++i) { - std::lock_guard lock(shared_mutexes_[i]); - if (shared_instances_[i] == nullptr) { - need_nccl_params = true; - break; - } - } - if (need_nccl_params) { - return AbstractTransformerModel::createNcclParams(node_id, device_id_start, multi_node); - } - else { - TM_LOG_INFO("Skipping NCCL param creation."); - - const int tensor_para_size = getTensorParaSize(); - const int pipeline_para_size = getPipelineParaSize(); - const int local_comm_size = multi_node ? 
device_count : tensor_para_size * pipeline_para_size; - - std::vector tensor_para_params(local_comm_size); - std::vector pipeline_para_params(local_comm_size); - return {std::move(tensor_para_params), std::move(pipeline_para_params)}; - } -} - template std::unique_ptr LlamaTritonModel::createInstanceComm(int size) { diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 02736e0f23..1a069fcac5 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -20,6 +20,7 @@ #pragma once +#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" @@ -32,9 +33,6 @@ namespace ft = turbomind; -template -struct LlamaTritonSharedModelInstance; - template struct LlamaTritonModel: public AbstractTransformerModel { LlamaTritonModel(size_t tensor_para_size, @@ -43,7 +41,7 @@ struct LlamaTritonModel: public AbstractTransformerModel { std::string model_dir, std::string config = ""); - ~LlamaTritonModel() = default; + ~LlamaTritonModel() override; std::unique_ptr createModelInstance(int deviceId, @@ -66,14 +64,11 @@ struct LlamaTritonModel: public AbstractTransformerModel { void createCustomComms(std::vector>* custom_all_reduce_comms, int world_size) override; - std::pair, std::vector> - createNcclParams(const int node_id, const int device_id_start, const bool multi_node) override; - std::unique_ptr createInstanceComm(int size) override; void handleMissingParams(); - void setFfiLock(ffi_api_lock_ctrl_t func) + void set_ffi_lock(ffi_api_lock_ctrl_t func) { ffi_lock_ = func; } @@ -83,41 +78,26 @@ struct LlamaTritonModel: public AbstractTransformerModel { int getPipelineParaSize() override; private: - std::unique_ptr> + std::unique_ptr> createSharedModelInstance(int deviceId, int rank, std::pair, std::vector> nccl_params, std::shared_ptr custom_all_reduce_comm = nullptr); - size_t head_num_; - size_t kv_head_num_; - size_t hidden_units_; - size_t size_per_head_; - size_t inter_size_; - size_t num_layer_; - size_t vocab_size_; - turbomind::LlamaAttentionParams attn_params_; - turbomind::EngineParams engine_params_; - float norm_eps_; - int start_id_; - int end_id_; - int cache_block_seq_len_; - int use_context_fmha_; - size_t tensor_para_size_; - size_t pipeline_para_size_; - ft::WeightType weight_type_; - bool attn_bias_; - int quant_policy_; - int group_size_; - turbomind::LoraParams lora_params_; - - // shared weights for each device - std::vector>> shared_weights_; - - std::shared_ptr::SharedState> shared_state_; - - std::vector>> shared_instances_; - std::deque shared_mutexes_; // is locking really needed? 
+ ft::ModelParam model_param_; + ft::AttentionParam attn_param_; + ft::LoraParam lora_param_; + ft::EngineParam engine_param_; + size_t tensor_para_size_; + size_t pipeline_para_size_; + ft::WeightType weight_type_; + bool attn_bias_; + int group_size_; + + std::shared_ptr shared_state_; + // Weights & engine instances for the ranks + std::vector>> weights_; + std::vector>> engines_; bool is_fp16_; int enable_custom_all_reduce_ = 0; diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index d133d171ef..8221f932ce 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -43,10 +43,10 @@ void triton_stream_callback(std::unordered_map* output_ } template -LlamaTritonModelInstance::LlamaTritonModelInstance(std::shared_ptr> instance, +LlamaTritonModelInstance::LlamaTritonModelInstance(ft::Engine& instance, std::unique_ptr> allocator, int device_id): - device_id_{device_id}, instance_(std::move(instance)), allocator_(std::move(allocator)) + device_id_{device_id}, instance_(&instance), allocator_(std::move(allocator)) { } @@ -147,15 +147,15 @@ LlamaTritonModelInstance::forward(std::shared_ptrcount("is_return_logits") && *(bool*)input_tensors->at("is_return_logits").data; - const size_t vocab_size = instance_->llm->vocab_size(); + const size_t vocab_size = instance_->model().vocab_size(); - allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len, is_return_logits); + allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len(), is_return_logits); std::unordered_map output_tensors = std::unordered_map{ {"output_ids", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_UINT32, - std::vector{request_batch_size, beam_width, (size_t)instance_->session_len}, + std::vector{request_batch_size, beam_width, (size_t)instance_->session_len()}, d_output_ids_}}, {"sequence_length", ft::Tensor{ft::MEMORY_CPU, @@ -177,7 +177,7 @@ LlamaTritonModelInstance::forward(std::shared_ptrcount("logprobs")) { - size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len) + 1; + size_t max_logprob_length = std::min((int)max_request_output_len, instance_->session_len()) + 1; h_logprob_vals_ = (float*)std::realloc( h_logprob_vals_, sizeof(float) * request_batch_size * beam_width * max_logprob_length * ft::kMaxLogProb); h_logprob_indexes_ = (uint32_t*)std::realloc(h_logprob_indexes_, @@ -223,7 +223,7 @@ LlamaTritonModelInstance::forward(std::shared_ptrreturnStream())); - instance_->llm->forward(&output_tensors, &ft_input_tensors, {instance_comm, callback}); + instance_->Submit(&output_tensors, &ft_input_tensors, {instance_comm, callback}); // ! stream synced by the model before returning } catch (...) 
{ @@ -251,8 +251,10 @@ void LlamaTritonModelInstance::allocateBuffer(const size_t request_batch_size d_sequence_lengths_ = (int*)std::realloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width); if (is_return_logits) { - d_output_logits_ = (float*)allocator_->reMalloc( - d_output_logits_, sizeof(float) * request_batch_size * max_input_len * instance_->llm->vocab_size(), false); + d_output_logits_ = (float*)allocator_->reMalloc(d_output_logits_, + sizeof(float) * request_batch_size * max_input_len + * instance_->model().vocab_size(), + false); } } diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index e33b616f73..08088c05d5 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -20,6 +20,7 @@ #pragma once +#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" @@ -27,23 +28,10 @@ namespace ft = turbomind; -template -struct LlamaTritonSharedModelInstance { - std::unique_ptr> allocator; - std::unique_ptr> peer_allocator; - std::unique_ptr cublas_algo_map; - std::unique_ptr cublas_wrapper_mutex; - std::unique_ptr cublas_wrapper; - std::unique_ptr cuda_device_prop_ptr; - std::shared_ptr> llm_weight; - std::unique_ptr> llm; - const int session_len; -}; - template struct LlamaTritonModelInstance: AbstractTransformerModelInstance { - LlamaTritonModelInstance(std::shared_ptr> instance, + LlamaTritonModelInstance(ft::Engine& instance, std::unique_ptr> allocator, int device_id); ~LlamaTritonModelInstance(); @@ -62,7 +50,7 @@ struct LlamaTritonModelInstance: AbstractTransformerModelInstance { convert_outputs(const std::unordered_map& output_tensors); private: - const std::shared_ptr> instance_; + ft::Engine* instance_; const std::unique_ptr> allocator_; std::unordered_map diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index b48bc9a1d2..066d75a780 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -306,6 +306,8 @@ using TensorMap = std::unordered_map; struct AbstractTransformerModel { static std::shared_ptr createLlamaModel(std::string model_dir); + virtual ~AbstractTransformerModel() = default; + virtual std::pair, std::vector> createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false); From 3ffb0c4c4769253e8b17578f5410d2539385ad17 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Thu, 22 Aug 2024 22:07:17 +0800 Subject: [PATCH 36/39] support openbmb/MiniCPM-V-2_6 (#2351) * support model convert * update template and vision model * update docs * update README --- README.md | 1 + README_ja.md | 1 + README_zh-CN.md | 1 + docs/en/multi_modal/minicpmv.md | 198 +++++++++++++++++- docs/en/supported_models/supported_models.md | 57 ++--- docs/zh_cn/multi_modal/minicpmv.md | 196 ++++++++++++++++- .../supported_models/supported_models.md | 57 ++--- lmdeploy/lite/apis/auto_awq.py | 12 +- lmdeploy/model.py | 7 +- lmdeploy/vl/model/minicpmv.py | 122 ++++++++--- lmdeploy/vl/templates.py | 45 +++- 11 files changed, 592 insertions(+), 105 deletions(-) diff --git a/README.md b/README.md index 03b3e160bc..d545da56b7 100644 --- a/README.md 
+++ b/README.md @@ -151,6 +151,7 @@ For detailed inference benchmarks in more devices and more settings, please refe

  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
  • MiniCPM-Llama3-V-2_5
+  • MiniCPM-V-2_6
  • Phi-3-vision (4.2B)
  • GLM-4V (9B)
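The line added above registers MiniCPM-V-2_6 in the README's supported-VLM list; the model is driven through the same `lmdeploy.pipeline` API documented later in this patch. A minimal sketch, assuming the `openbmb/MiniCPM-V-2_6` checkpoint can be fetched from the Hugging Face Hub:

```python
from lmdeploy import pipeline
from lmdeploy.vl import load_image

# Run one image-text query through the VLM pipeline with the newly listed model;
# the tiger image is only an example input.
pipe = pipeline('openbmb/MiniCPM-V-2_6')
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
response = pipe(('describe this image', image))
print(response)
```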
diff --git a/README_ja.md b/README_ja.md index 16a5f4bd70..86facc7fde 100644 --- a/README_ja.md +++ b/README_ja.md @@ -152,6 +152,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
  • MiniCPM-Llama3-V-2_5
+  • MiniCPM-V-2_6
  • Phi-3-vision (4.2B)
  • GLM-4V (9B)
diff --git a/README_zh-CN.md b/README_zh-CN.md index 1bac57543c..71ee2b15f0 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -152,6 +152,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • CogVLM-Chat (17B)
  • CogVLM2-Chat (19B)
  • MiniCPM-Llama3-V-2_5
+  • MiniCPM-V-2_6
  • Phi-3-vision (4.2B)
  • GLM-4V (9B)
  • diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index 73bacf82a4..ed9ad71d20 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -1,24 +1,208 @@ # MiniCPM-V -## Introduction +LMDeploy supports the following MiniCPM-V series of models, which are detailed in the table below: -[MiniCPM-V](https://github.com/OpenBMB/MiniCPM-V) is a series of end-side multimodal LLMs (MLLMs) designed for vision-language understanding. LMDeploy supports MiniCPM-Llama3-V-2_5 model [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) in TurboMind engine. +| Model | Supported Inference Engine | +| :------------------: | :------------------------: | +| MiniCPM-Llama3-V-2_5 | TurboMind | +| MiniCPM-V-2_6 | TurboMind | -## Quick Start +The next chapter demonstrates how to deploy an MiniCPM-V model using LMDeploy, with [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) as an example. -Please install LMDeploy by following the [installation guide](../installation.md) +## Installation -### Offline inference pipeline +Please install LMDeploy by following the [installation guide](../installation.md). -The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) +## Offline inference + +The following sample code shows the basic usage of VLM pipeline. For detailed information, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) ```python from lmdeploy import pipeline from lmdeploy.vl import load_image -pipe = pipeline('openbmb/MiniCPM-Llama3-V-2_5') +pipe = pipeline('openbmb/MiniCPM-V-2_6') image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') response = pipe(('describe this image', image)) print(response) ``` + +More examples are listed below: + +
    + + Chat with multiple images + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('openbmb/MiniCPM-V-2_6', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(max_slice_nums=9, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg')), + dict(type='image_url', image_url=dict(max_slice_nums=9, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) +``` + +
    + +
    + + In-context few-shot learning + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('openbmb/MiniCPM-V-2_6', log_level='INFO') + +question = "production date" +messages = [ + dict(role='user', content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url='example1.jpg')), + ]), + dict(role='assistant', content='2023.08.04'), + dict(role='user', content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url='example2.jpg')), + ]), + dict(role='assistant', content='2007.04.24'), + dict(role='user', content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url='test.jpg')), + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) +``` + +
    + +
    + + Chat with video + + +```python +from lmdeploy import pipeline, GenerationConfig +from lmdeploy.vl.utils import encode_image_base64 +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer +from decord import VideoReader, cpu # pip install decord + +pipe = pipeline('openbmb/MiniCPM-V-2_6', log_level='INFO') + +MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number +def encode_video(video_path): + def uniform_sample(l, n): + gap = len(l) / n + idxs = [int(i * gap + gap / 2) for i in range(n)] + return [l[i] for i in idxs] + vr = VideoReader(video_path, ctx=cpu(0)) + sample_fps = round(vr.get_avg_fps() / 1) # FPS + frame_idx = [i for i in range(0, len(vr), sample_fps)] + if len(frame_idx) > MAX_NUM_FRAMES: + frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) + frames = vr.get_batch(frame_idx).asnumpy() + frames = [Image.fromarray(v.astype('uint8')) for v in frames] + print('num frames:', len(frames)) + return frames + +video_path="video_test.mp4" +frames = encode_video(video_path) +question = "Describe the video" + +content=[dict(type='text', text=question)] +for frame in frames: + content.append(dict(type='image_url', image_url=dict(use_image_id=False, max_slice_nums=2, + url=f'data:image/jpeg;base64,{encode_image_base64(frame)}'))) + +messages = [dict(role='user', content=content)] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) +``` + +
    + +## Online serving + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server openbmb/MiniCPM-V-2_6 +``` + +You can also start the service using the official lmdeploy docker image: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:v0.5.3-cu12 \ + lmdeploy serve api_server openbmb/MiniCPM-V-2_6 +``` + +The docker compose is another option. Create a `docker-compose.yml` configuration file in the root directory of the lmdeploy project as follows: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:v0.5.3-cu12 + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server openbmb/MiniCPM-V-2_6 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +Then, you can execute the startup command as below: + +```shell +docker-compose up -d +``` + +If you find the following logs after running `docker logs -f lmdeploy`, it means the service launches successfully. + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +The arguments of `lmdeploy serve api_server` can be reviewed in detail by `lmdeploy serve api_server -h`. 
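The section above covers launching the service; reaching it from a client is sketched below, assuming the default port 23333 and the OpenAI-compatible `/v1` routes that `api_server` exposes (the image URL is an arbitrary example):

```python
from openai import OpenAI

# Query the running api_server through its OpenAI-compatible endpoint.
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'Describe this image.'},
            {'type': 'image_url',
             'image_url': {'url': 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'}},
        ],
    }],
    temperature=0.8,
)
print(response.choices[0].message.content)
```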
+ +More information about `api_server` as well as how to access the service can be found from [here](api_server_vl.md) diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 40300d82c1..7f6a4cffa3 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -2,34 +2,35 @@ ## Models supported by TurboMind -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | - | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM | Llama3-V-2_5 | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :---------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | - | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | 
LLM | Yes | Yes | Yes | - | "-" means not verified yet. diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md index 20fc1eac92..8598d4e80b 100644 --- a/docs/zh_cn/multi_modal/minicpmv.md +++ b/docs/zh_cn/multi_modal/minicpmv.md @@ -1,16 +1,19 @@ # MiniCPM-V -## 简介 +LMDeploy 支持 MiniCPM-V 系列模型,具体如下: -[MiniCPM-V](https://github.com/OpenBMB/MiniCPM-V) 是面向图文理解的端侧多模态大模型系列。该系列模型接受图像和文本输入,并提供高质量的文本输出。 LMDeploy 支持了 [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) 模型,通过 TurboMind 引擎推理。 +| Model | Supported Inference Engine | +| :------------------: | :------------------------: | +| MiniCPM-Llama3-V-2_5 | TurboMind | +| MiniCPM-V-2_6 | TurboMind | -## 快速开始 +本文将以[MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)为例,演示使用 LMDeploy 部署 MiniCPM-V 系列模型的方法 -### 安装 +## 安装 -请参考[安装文档](../installation.md)安装 LMDeploy +请参考[安装文档](../installation.md)安装 LMDeploy。 -### 离线推理 pipeline +## 离线推理 以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) @@ -18,9 +21,188 @@ from lmdeploy import pipeline from lmdeploy.vl import load_image -pipe = pipeline('openbmb/MiniCPM-Llama3-V-2_5') +pipe = pipeline('openbmb/MiniCPM-V-2_6') image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') response = pipe(('describe this image', image)) print(response) ``` + +更多例子如下: + +
    + + 多张图片,多轮对话 + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('openbmb/MiniCPM-V-2_6', log_level='INFO') +messages = [ + dict(role='user', content=[ + dict(type='text', text='Describe the two images in detail.'), + dict(type='image_url', image_url=dict(max_slice_nums=9, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image1.jpg')), + dict(type='image_url', image_url=dict(max_slice_nums=9, url='https://raw.githubusercontent.com/OpenGVLab/InternVL/main/internvl_chat/examples/image2.jpg')) + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) + +messages.append(dict(role='assistant', content=out.text)) +messages.append(dict(role='user', content='What are the similarities and differences between these two images.')) +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) +``` + +
    + +
    + + 上下文小样本学习 + + +```python +from lmdeploy import pipeline, GenerationConfig + +pipe = pipeline('openbmb/MiniCPM-V-2_6', log_level='INFO') + +question = "production date" +messages = [ + dict(role='user', content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url='example1.jpg')), + ]), + dict(role='assistant', content='2023.08.04'), + dict(role='user', content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url='example2.jpg')), + ]), + dict(role='assistant', content='2007.04.24'), + dict(role='user', content=[ + dict(type='text', text=question), + dict(type='image_url', image_url=dict(url='test.jpg')), + ]) +] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) +``` + +
    + +
    + + 视频对话 + + +```python +from lmdeploy import pipeline, GenerationConfig +from lmdeploy.vl.utils import encode_image_base64 +import torch +from PIL import Image +from transformers import AutoModel, AutoTokenizer +from decord import VideoReader, cpu # pip install decord + +pipe = pipeline('openbmb/MiniCPM-V-2_6', log_level='INFO') + +MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number +def encode_video(video_path): + def uniform_sample(l, n): + gap = len(l) / n + idxs = [int(i * gap + gap / 2) for i in range(n)] + return [l[i] for i in idxs] + vr = VideoReader(video_path, ctx=cpu(0)) + sample_fps = round(vr.get_avg_fps() / 1) # FPS + frame_idx = [i for i in range(0, len(vr), sample_fps)] + if len(frame_idx) > MAX_NUM_FRAMES: + frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES) + frames = vr.get_batch(frame_idx).asnumpy() + frames = [Image.fromarray(v.astype('uint8')) for v in frames] + print('num frames:', len(frames)) + return frames + +video_path="video_test.mp4" +frames = encode_video(video_path) +question = "Describe the video" + +content=[dict(type='text', text=question)] +for frame in frames: + content.append(dict(type='image_url', image_url=dict(use_image_id=False, max_slice_nums=2, + url=f'data:image/jpeg;base64,{encode_image_base64(frame)}'))) + +messages = [dict(role='user', content=content)] +out = pipe(messages, gen_config=GenerationConfig(top_k=1)) +print(out.text) +``` + +
    + +## 在线服务 + +你可以通过 `lmdeploy serve api_server` CLI 工具启动服务: + +```shell +lmdeploy serve api_server openbmb/MiniCPM-V-2_6 +``` + +也可以基于 LMDeploy 的 docker 启动服务: + +```shell +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 23333:23333 \ + --ipc=host \ + openmmlab/lmdeploy:v0.5.3-cu12 \ + lmdeploy serve api_server openbmb/MiniCPM-V-2_6 +``` + +Docker compose 的方式也是一种选择。在 LMDeploy 代码库的根目录下创建`docker-compose.yml`文件,内容参考如下: + +```yaml +version: '3.5' + +services: + lmdeploy: + container_name: lmdeploy + image: openmmlab/lmdeploy:v0.5.3-cu12 + ports: + - "23333:23333" + environment: + HUGGING_FACE_HUB_TOKEN: + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + stdin_open: true + tty: true + ipc: host + command: lmdeploy serve api_server openbmb/MiniCPM-V-2_6 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: "all" + capabilities: [gpu] +``` + +然后,你就可以执行命令启动服务了: + +```shell +docker-compose up -d +``` + +通过`docker logs -f lmdeploy`可以查看启动的日志信息,如果发现类似下方的日志信息,就表明服务启动成功了。 + +```text +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +HINT: Please open http://0.0.0.0:23333 in a browser for detailed api usage!!! +INFO: Started server process [2439] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:23333 (Press CTRL+C to quit) +``` + +有关 `lmdeploy serve api_server` 的详细参数可以通过`lmdeploy serve api_server -h`查阅。 + +关于 `api_server` 更多的介绍,以及访问 `api_server` 的方法,请阅读[此处](api_server_vl.md) diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index fca936c6f2..e2adbbcb07 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -2,34 +2,35 @@ ## TurboMind 支持的模型 -| Model | Size | Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | -| :-------------------: | :----------: | :--: | :-------: | :-----: | :-----: | :---: | -| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | -| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | -| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | -| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | -| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | -| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | -| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | -| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | -| Mistral | 7B | LLM | Yes | Yes | Yes | - | -| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | -| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | -| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | -| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | -| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | -| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | -| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes | -| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes | -| MiniCPM | Llama3-V-2_5 | MLLM | Yes | Yes | Yes | Yes | -| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | -| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | -| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | +| Model | Size | 
Type | FP16/BF16 | KV INT8 | KV INT4 | W4A16 | +| :-------------------: | :---------: | :--: | :-------: | :-----: | :-----: | :---: | +| Llama | 7B - 65B | LLM | Yes | Yes | Yes | Yes | +| Llama2 | 7B - 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| Llama3.1 | 8B, 70B | LLM | Yes | Yes | Yes | Yes | +| InternLM | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2 | 7B - 20B | LLM | Yes | Yes | Yes | Yes | +| InternLM2.5 | 7B | LLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2 | 7B, 4khd-7B | MLLM | Yes | Yes | Yes | Yes | +| InternLM-XComposer2.5 | 7B | MLLM | Yes | Yes | Yes | Yes | +| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes | +| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes | +| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes | +| Mistral | 7B | LLM | Yes | Yes | Yes | - | +| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| DeepSeek-VL | 7B | MLLM | Yes | Yes | Yes | Yes | +| Baichuan | 7B | LLM | Yes | Yes | Yes | Yes | +| Baichuan2 | 7B | LLM | Yes | Yes | Yes | Yes | +| Code Llama | 7B - 34B | LLM | Yes | Yes | Yes | No | +| YI | 6B - 34B | LLM | Yes | Yes | Yes | Yes | +| LLaVA(1.5,1.6) | 7B - 34B | MLLM | Yes | Yes | Yes | Yes | +| InternVL | v1.1- v1.5 | MLLM | Yes | Yes | Yes | Yes | +| InternVL2 | 2B-76B | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes | +| MiniGeminiLlama | 7B | MLLM | Yes | - | - | Yes | +| GLM4 | 9B | LLM | Yes | Yes | Yes | Yes | +| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | “-” 表示还没有验证。 diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index 1c932eb9d6..6cd9b1fd42 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import os import os.path as osp import shutil @@ -12,8 +13,6 @@ from .calibrate import LAYER_TYPE_MAP, NORM_TYPE_MAP, calibrate -# from lmdeploy.lite.utils.export_turbomind import export_turbomind_config - NORM_TYPE_MAP = NORM_TYPE_MAP # legacy @@ -33,6 +32,15 @@ def save_vl_model(vl_model, model_path, dst_path): shutil.copy(tmp_path, osp.join(dst_path, name)) elif osp.isdir(tmp_path): shutil.copytree(tmp_path, osp.join(dst_path, name)) + # AutoProcessor files + allfiles = os.listdir(model_path) + for file in allfiles: + if not file.endswith('.py'): + continue + copy_src = osp.join(model_path, file) + copy_dst = osp.join(dst_path, file) + if not osp.exists(copy_dst): + shutil.copyfile(copy_src, copy_dst) def auto_awq(model: str, diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 9083463de1..3bf5796693 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -348,7 +348,7 @@ def match(cls, model_path: str) -> Optional[str]: """ path = model_path.lower() if 'llava' in path and 'v1' in path and 'v1.6-34b' not in path \ - and 'mistral' not in path: + and 'mistral' not in path: return 'llava-v1' elif 'llava-1.5' in path: return 'llava-v1' @@ -802,7 +802,7 @@ def __init__( - Required parameters MUST be specified - Only call one function at a time - Put the entire function call reply on one line" -- Always add your sources when using search results to answer the user query\n\n""", # noqa +- Always add your sources when using search results to answer the user query\n\n""", # noqa knowledge='Cutting Knowledge Date: December 2023\nToday Date: 23 Jul 2024\n\n', meta_instruction='You are a helpful assistant.', ipython='<|start_header_id|>ipython<|end_header_id|>\n\n', @@ -883,6 +883,7 @@ def match(cls, model_path: str) -> Optional[str]: return 'llama3_1' +@MODELS.register_module(name='minicpmv-2d6') @MODELS.register_module(name='qwen') class Qwen7BChat(BaseChatTemplate): """Chat template for Qwen-7B-Chat.""" @@ -918,6 +919,8 @@ def match(cls, model_path: str) -> Optional[str]: """ if 'qwen' in model_path.lower(): return 'qwen' + if 'minicpm-v-2_6' in model_path.lower(): + return 'minicpmv-2d6' @MODELS.register_module(name='codellama') diff --git a/lmdeploy/vl/model/minicpmv.py b/lmdeploy/vl/model/minicpmv.py index 3728a11544..6ced27992b 100644 --- a/lmdeploy/vl/model/minicpmv.py +++ b/lmdeploy/vl/model/minicpmv.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings -from typing import List +from typing import Dict, List import torch from PIL.Image import Image @@ -48,25 +48,42 @@ def build_model(self): device=model.resampler.proj.device) self.config = config self.model = model.eval() - - if hasattr(config, 'vision_config'): - self._forward_func = self._forward_v2_5 - else: - self._forward_func = self._forward_v2 - - # adapt new code commit 287e3f85 - if not hasattr(model, 'slice_image'): + self.init_forward_func() + + def init_forward_func(self): + if not hasattr(self.config, 'version'): + msg = 'LMDeploy only support `MiniCPM-V-2_6` and '\ + '`MiniCPM-Llama3-V-2_5`.\nCan not find `version` in config, ' \ + 'please consider update the huggingface model.' 
+ logger.warn(msg) + + self._forward_func = self._forward_v2_5 + if hasattr(self.config, 'version'): + version = str(self.config.version) + if version == '2.6': + self._forward_func = self._forward_v2_6 + + if self._forward_func == self._forward_v2_5: + logger.info('using _forward_v2_5') + if not hasattr(self.model, 'slice_image'): + # adapt new code commit 287e3f85 (MiniCPM-Llama3-V-2_5) + from transformers import AutoProcessor + processor = AutoProcessor.from_pretrained( + self.model_path, trust_remote_code=True) + self.model.slice_image = processor.image_processor.slice_image + + def _reshape_by_patch(x): + out = x.cpu().numpy() + out = processor.image_processor.reshape_by_patch(out) + return torch.from_numpy(out).to(device=x.device) + + self.model.reshape_by_patch = _reshape_by_patch + + if self._forward_func == self._forward_v2_6: + logger.info('using _forward_v2_6') from transformers import AutoProcessor - processor = AutoProcessor.from_pretrained(self.model_path, - trust_remote_code=True) - model.slice_image = processor.image_processor.slice_image - - def _reshape_by_patch(x): - out = x.cpu().numpy() - out = processor.image_processor.reshape_by_patch(out) - return torch.from_numpy(out).to(device=x.device) - - model.reshape_by_patch = _reshape_by_patch + self.model.processor = AutoProcessor.from_pretrained( + self.model_path, trust_remote_code=True) def _get_slice_image(self, image: Image): slice_images = [] @@ -90,11 +107,7 @@ def _reshape_by_patch(self, slice_images): tgt_sizes.append(torch.Tensor([H, W]).type(torch.int32)) return patches, tgt_sizes - def _forward_v2(self, images: List[Image]): - """forward for MiniCPM-V-2.""" - raise NotImplementedError - - def _forward_v2_5(self, images: List[Image]): + def _forward_v2_5(self, images: List[Image], params: List[Dict] = None): """forward for MiniCPM-Llama3-V-2_5.""" patches = [] tgt_sizes = [] @@ -137,8 +150,65 @@ def _forward_v2_5(self, images: List[Image]): return outputs + def _forward_v2_6(self, images: List[Image], params: List[Dict] = None): + """forward for MiniCPM-V-2_6.""" + patches = [] + tgt_sizes = [] + best_grids = [] + num_patches = [] + max_slice_nums = self.model.processor.image_processor.max_slice_nums + use_image_id = self.model.processor.image_processor.use_image_id + for image, param in zip(images, params): + max_slice_nums = param.get('max_slice_nums', max_slice_nums) + use_image_id = param.get('use_image_id', use_image_id) + outputs = self.model.processor.image_processor( + image, max_slice_nums=max_slice_nums) + patches.extend(outputs['pixel_values'][0]) + num_patches.append(len(outputs['pixel_values'][0])) + tgt_sizes.extend(outputs['tgt_sizes'][0]) + grid = self.model.processor.image_processor.get_sliced_grid( + image_size=image.size, max_slice_nums=max_slice_nums) + best_grids.append(grid) + + patches = [ + torch.as_tensor(x).to(dtype=torch.half, device=self.model.device) + for x in patches + ] + patches = [x.flatten(end_dim=1).permute(1, 0) for x in patches] + tgt_sizes = [torch.as_tensor(x) for x in tgt_sizes] + tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) + max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) + all_pixel_values = torch.nn.utils.rnn.pad_sequence(patches, + batch_first=True, + padding_value=0.0) + B, L, _ = all_pixel_values.shape + all_pixel_values = all_pixel_values.permute(0, 2, + 1).reshape(B, 3, -1, L) + patch_attn_mask = torch.zeros((B, 1, max_patches), + dtype=torch.bool, + device=self.model.device) + for i in range(B): + patch_attn_mask[i, 0, :tgt_sizes[i][0] * 
tgt_sizes[i][1]] = True + vision_embedding = self.model.vpm( + all_pixel_values.type(torch.half), + patch_attention_mask=patch_attn_mask, + tgt_sizes=tgt_sizes).last_hidden_state + vision_embedding = self.model.resampler(vision_embedding, tgt_sizes) + vision_embedding = torch.split(vision_embedding, num_patches, 0) + outputs = [] + for embeddings, grid in zip(vision_embedding, best_grids): + embeddings = embeddings.cpu() # n x d x h + outputs.append( + dict(embeddings=embeddings, + grid=grid, + use_image_id=use_image_id)) + + return outputs + @torch.no_grad() - def forward(self, images: List[Image]) -> List[torch.Tensor]: + def forward(self, + images: List[Image], + params: List[Dict] = None) -> List[torch.Tensor]: """forward.""" images = [x.convert('RGB') for x in images] - return self._forward_func(images) + return self._forward_func(images, params) diff --git a/lmdeploy/vl/templates.py b/lmdeploy/vl/templates.py index fe7f01edfe..0339d10a66 100644 --- a/lmdeploy/vl/templates.py +++ b/lmdeploy/vl/templates.py @@ -279,7 +279,7 @@ def append_image_token(self, prompt, num_images: int): class MiniCPMVTempateWrapper(VLChatTemplateWrapper): - """MiniCPMV chat template.""" + """MiniCPM-Llama3-V-2_5 chat template.""" def append_image_token(self, prompt, num_images: int): return f'{IMAGE_TOKEN}\n' * num_images + prompt @@ -307,6 +307,37 @@ def update_image_token(self, prompt, features): return _prompt, _features +class MiniCPMV26TempateWrapper(MiniCPMVTempateWrapper): + """MiniCPM-V-2_6 chat template.""" + + def update_image_token(self, prompt, features): + _features = [] + _prompt = [] + segs = prompt.split(f'{IMAGE_TOKEN}\n') + idx = 0 + for i, seg in enumerate(segs): + if i > 0 and i <= len(features): + _feat = features[i - 1]['embeddings'].split(1) + _feat = [x.squeeze() for x in _feat] + _features.extend(_feat) + _seg = f'{IMAGE_TOKEN}' + if features[i - 1].get('use_image_id', False): + _seg = f'{idx}' + _seg + idx += 1 + if len(_feat) > 1: + grid = features[i - 1]['grid'] + if grid is not None: + _slice = '\n'.join( + [f'{IMAGE_TOKEN}' * grid[0]] * + grid[1]) + _seg = _seg + _slice + _seg += '\n' + _prompt.append(_seg) + _prompt.append(seg) + _prompt = ''.join(_prompt) + return _prompt, _features + + class GLM4VChatTemplateWrapper(VLChatTemplateWrapper): """glm-4v chat template.""" pass @@ -316,11 +347,10 @@ def get_vl_prompt_template(model_path: str, chat_template: BaseModel, model_name: str) -> VLChatTemplateWrapper: """get vision language prompt template.""" assert type(chat_template) != type(BaseModel()), 'failed to match ' \ - 'chat template, please explicit set chat_template_config' # noqa E721 + 'chat template, please explicit set chat_template_config' # noqa E721 if model_name == 'yi-vl': return YiVLChatTemplateWrapper(chat_template) - - arch, _ = get_model_arch(model_path) + arch, cfg = get_model_arch(model_path) if arch == 'QWenLMHeadModel': return QwenVLChatTemplateWrapper(chat_template) elif arch in [ @@ -340,7 +370,12 @@ def get_vl_prompt_template(model_path: str, chat_template: BaseModel, elif arch in ['MiniGeminiLlamaForCausalLM', 'MGMLlamaForCausalLM']: return MiniGeminiLlamaTempateWrapper(chat_template) elif arch == 'MiniCPMV': - return MiniCPMVTempateWrapper(chat_template) + version_map = { + '2.5': MiniCPMVTempateWrapper, + '2.6': MiniCPMV26TempateWrapper + } + version = str(getattr(cfg, 'version', '2.5')) + return version_map[version](chat_template) elif arch == 'ChatGLMModel': return GLM4VChatTemplateWrapper(chat_template) raise ValueError(f'unsupported 
vl_prompt_template with arch {arch}') From 0772c1073c35c24c5e8ed9970f52d5047d44ad5b Mon Sep 17 00:00:00 2001 From: DearPlanet Date: Fri, 23 Aug 2024 16:28:02 +0800 Subject: [PATCH 37/39] feat(server): enable `seed` parameter for openai compatible server. (#2353) * feat(server): enable `seed` parameter for openai compatible server. * refactor: fix format issue --- lmdeploy/serve/openai/api_server.py | 15 ++++++++++++--- lmdeploy/serve/openai/protocol.py | 3 +++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 6eea1066cc..c434faf86f 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -356,6 +356,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, except Exception as e: return create_error_response(HTTPStatus.BAD_REQUEST, str(e)) + random_seed = request.seed if request.seed else None + gen_config = GenerationConfig( max_new_tokens=request.max_tokens, logprobs=gen_logprobs, @@ -366,7 +368,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, ignore_eos=request.ignore_eos, stop_words=request.stop, skip_special_tokens=request.skip_special_tokens, - logits_processors=logits_processors) + logits_processors=logits_processors, + random_seed=random_seed) tools = None if request.tools and request.tool_choice != 'none': @@ -583,6 +586,8 @@ async def completions_v1(request: CompletionRequest, request.prompt = [request.prompt] if isinstance(request.stop, str): request.stop = [request.stop] + random_seed = request.seed if request.seed else None + gen_config = GenerationConfig( max_new_tokens=request.max_tokens if request.max_tokens else 512, logprobs=request.logprobs, @@ -592,7 +597,8 @@ async def completions_v1(request: CompletionRequest, repetition_penalty=request.repetition_penalty, ignore_eos=request.ignore_eos, stop_words=request.stop, - skip_special_tokens=request.skip_special_tokens) + skip_special_tokens=request.skip_special_tokens, + random_seed=random_seed) generators = [] for i in range(len(request.prompt)): result_generator = VariableInterface.async_engine.generate( @@ -831,6 +837,8 @@ async def chat_interactive_v1(request: GenerateRequest, if isinstance(request.stop, str): request.stop = [request.stop] + random_seed = request.seed if request.seed else None + gen_config = GenerationConfig( max_new_tokens=request.request_output_len, top_p=request.top_p, @@ -839,7 +847,8 @@ async def chat_interactive_v1(request: GenerateRequest, repetition_penalty=request.repetition_penalty, ignore_eos=request.ignore_eos, stop_words=request.stop, - skip_special_tokens=request.skip_special_tokens) + skip_special_tokens=request.skip_special_tokens, + random_seed=random_seed) if request.image_url: from lmdeploy.vl import load_image if isinstance(request.image_url, List): diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 9455fe584c..48c46cae25 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -115,6 +115,7 @@ class ChatCompletionRequest(BaseModel): ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True top_k: Optional[int] = 40 + seed: Optional[int] = None class FunctionResponse(BaseModel): @@ -229,6 +230,7 @@ class CompletionRequest(BaseModel): ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True top_k: Optional[int] = 40 # for opencompass + seed: Optional[int] = None class CompletionResponseChoice(BaseModel): @@ -315,6 +317,7 @@ 
class GenerateRequest(BaseModel): skip_special_tokens: Optional[bool] = True cancel: Optional[bool] = False # cancel a responding request adapter_name: Optional[str] = Field(default=None, examples=[None]) + seed: Optional[int] = None class GenerateResponse(BaseModel): From ad399f32b3d6a08082f2a751622adeec89c672dc Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Fri, 23 Aug 2024 20:20:44 +0800 Subject: [PATCH 38/39] Support phi3.5 for pytorch engine (#2361) * support phi3.5 * update docs * fix template * add doc for phi3 vision * update docs --- README.md | 3 + README_ja.md | 3 + README_zh-CN.md | 3 + docs/en/multi_modal/index.rst | 1 + docs/en/multi_modal/phi3.md | 77 +++++++++++ docs/en/supported_models/supported_models.md | 3 + docs/zh_cn/multi_modal/index.rst | 1 + docs/zh_cn/multi_modal/phi3.md | 76 +++++++++++ .../supported_models/supported_models.md | 3 + lmdeploy/model.py | 4 +- lmdeploy/pytorch/models/module_map.py | 18 +++ lmdeploy/pytorch/models/phi3_moe.py | 126 ++++++++++++++++++ lmdeploy/pytorch/supported_models.py | 2 + tests/test_lmdeploy/test_model.py | 34 +++++ 14 files changed, 352 insertions(+), 2 deletions(-) create mode 100644 docs/en/multi_modal/phi3.md create mode 100644 docs/zh_cn/multi_modal/phi3.md create mode 100644 lmdeploy/pytorch/models/phi3_moe.py diff --git a/README.md b/README.md index d545da56b7..a0db34e369 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,8 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • Dbrx (132B)
  • StarCoder2 (3B - 15B)
  • Phi-3-mini (3.8B)
+ • Phi-3.5-mini (3.8B)
+ • Phi-3.5-MoE (16x3.8B)
@@ -153,6 +155,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
  • MiniCPM-Llama3-V-2_5
  • MiniCPM-V-2_6
  • Phi-3-vision (4.2B)
+ • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • diff --git a/README_ja.md b/README_ja.md index 86facc7fde..94e3eb7b6c 100644 --- a/README_ja.md +++ b/README_ja.md @@ -137,6 +137,8 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • Dbrx (132B)
  • StarCoder2 (3B - 15B)
  • Phi-3-mini (3.8B)
+ • Phi-3.5-mini (3.8B)
+ • Phi-3.5-MoE (16x3.8B)
@@ -154,6 +156,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
  • MiniCPM-Llama3-V-2_5
  • MiniCPM-V-2_6
  • Phi-3-vision (4.2B)
+ • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 71ee2b15f0..79d551e3e3 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -137,6 +137,8 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • Dbrx (132B)
  • StarCoder2 (3B - 15B)
  • Phi-3-mini (3.8B)
+ • Phi-3.5-mini (3.8B)
+ • Phi-3.5-MoE (16x3.8B)
@@ -154,6 +156,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
  • MiniCPM-Llama3-V-2_5
  • MiniCPM-V-2_6
  • Phi-3-vision (4.2B)
+ • Phi-3.5-vision (4.2B)
  • GLM-4V (9B)
  • diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst index 3c6061e776..4218e5c2eb 100644 --- a/docs/en/multi_modal/index.rst +++ b/docs/en/multi_modal/index.rst @@ -10,3 +10,4 @@ Vision-Language Models xcomposer2d5.md cogvlm.md minicpmv.md + phi3.md diff --git a/docs/en/multi_modal/phi3.md b/docs/en/multi_modal/phi3.md new file mode 100644 index 0000000000..a801618b35 --- /dev/null +++ b/docs/en/multi_modal/phi3.md @@ -0,0 +1,77 @@ +# Phi-3 Vision + +## Introduction + +[Phi-3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) is a family of small language and multi-modal models from MicroSoft. LMDeploy supports the multi-modal models as below. + +| Model | Size | Supported Inference Engine | +| :-------------------------------------------------------------------------------------------------: | :--: | :------------------------: | +| [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 4.2B | PyTorch | +| [microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 4.2B | PyTorch | + +The next chapter demonstrates how to deploy an Phi-3 model using LMDeploy, with [microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) as an example. + +## Installation + +Please install LMDeploy by following the [installation guide](../installation.md) and install the dependency [Flash-Attention](https://github.com/Dao-AILab/flash-attention) + +```shell +# It is recommended to find the whl package that matches the environment from the releases on https://github.com/Dao-AILab/flash-attention. +pip install flash-attn +``` + +## Offline inference + +The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('microsoft/Phi-3.5-vision-instruct') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +## Online serving + +### Launch Service + +You can launch the server by the `lmdeploy serve api_server` CLI: + +```shell +lmdeploy serve api_server microsoft/Phi-3.5-vision-instruct +``` + +### Integrate with `OpenAI` + +Here is an example of interaction with the endpoint `v1/chat/completions` service via the openai package. +Before running it, please install the openai package by `pip install openai` + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md index 7f6a4cffa3..961d8af585 100644 --- a/docs/en/supported_models/supported_models.md +++ b/docs/en/supported_models/supported_models.md @@ -76,3 +76,6 @@ The TurboMind engine doesn't support window attention. 
Therefore, for models tha | GLM4 | 9B | LLM | Yes | No | No | No | | GLM-4V | 9B | MLLM | Yes | No | No | No | | CodeGeeX4 | 9B | LLM | Yes | No | No | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | No | No | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | No | No | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | No | No | - | diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst index c27b420e28..2307127e75 100644 --- a/docs/zh_cn/multi_modal/index.rst +++ b/docs/zh_cn/multi_modal/index.rst @@ -10,3 +10,4 @@ xcomposer2d5.md cogvlm.md minicpmv.md + phi3.md diff --git a/docs/zh_cn/multi_modal/phi3.md b/docs/zh_cn/multi_modal/phi3.md new file mode 100644 index 0000000000..2ed120344b --- /dev/null +++ b/docs/zh_cn/multi_modal/phi3.md @@ -0,0 +1,76 @@ +# Phi-3 Vision + +## 简介 + +[Phi-3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) 是微软发布的轻量级系列模型,LMDeploy支持了其中的多模态模型如下: + +| Model | Size | Supported Inference Engine | +| :-------------------------------------------------------------------------------------------------: | :--: | :------------------------: | +| [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 4.2B | PyTorch | +| [microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 4.2B | PyTorch | + +本文将以[microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)为例,演示使用 LMDeploy 部署 Phi-3 系列多模态模型的方法 + +## 安装 + +请参考[安装文档](../installation.md)安装 LMDeploy,并安装该模型的依赖。 + +```shell +# 建议从https://github.com/Dao-AILab/flash-attention/releases寻找和环境匹配的whl包 +pip install flash-attn +``` + +## 离线推理 pipeline + +以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('microsoft/Phi-3.5-vision-instruct') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +## 在线服务 + +### 服务启动 + +你可以通过 `lmdeploy serve api_server` CLI 工具启动服务: + +```shell +lmdeploy serve api_server microsoft/Phi-3.5-vision-instruct +``` + +### 使用 openai 接口 + +以下代码是通过 openai 包使用 `v1/chat/completions` 服务的例子。运行之前,请先安装 openai 包: `pip install openai`。 + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md index e2adbbcb07..9cf1a1df90 100644 --- a/docs/zh_cn/supported_models/supported_models.md +++ b/docs/zh_cn/supported_models/supported_models.md @@ -76,3 +76,6 @@ turbomind 引擎不支持 window attention。所以,对于应用了 window att | GLM4 | 9B | LLM | Yes | No | No | No | | GLM-4V | 9B | MLLM | Yes | No | No | No | | CodeGeeX4 | 9B | LLM | Yes | No | No | - | +| Phi-3.5-mini | 3.8B | LLM | Yes | No | No | - | +| Phi-3.5-MoE | 16x3.8B | LLM | Yes | No | No | - | +| Phi-3.5-vision | 4.2B | MLLM | Yes | No | No | - | diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 
3bf5796693..5b77bedd0d 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -1503,8 +1503,8 @@ def __init__(self, eoh='<|end|>\n', assistant='<|assistant|>\n', eoa='<|end|>\n', - separator='\n', - stop_words=['<|end|>', '<|endoftext|>'], + separator='', + stop_words=['<|end|>', '<|endoftext|>', '<|assistant|>'], **kwargs): super().__init__(system=system, meta_instruction=meta_instruction, diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py index 9d08e48219..00ea636627 100644 --- a/lmdeploy/pytorch/models/module_map.py +++ b/lmdeploy/pytorch/models/module_map.py @@ -390,3 +390,21 @@ 'modeling_internlm2.InternLM2FlashAttention2': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm2.PatchedInternLM2AttentionAscend', }) + +# phi-3.5-moe +MODULE_MAP.update({ + 'modeling_phimoe.PhiMoEAttention': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mixtral.PatchedMixtralAttention', + 'modeling_phimoe.PhiMoEFlashAttention2': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mixtral.PatchedMixtralAttention', + 'modeling_phimoe.PhiMoESdpaAttention': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mixtral.PatchedMixtralAttention', + 'modeling_phimoe.PhiMoEModel': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mixtral.PatchedMixtralModel', + 'modeling_phimoe.PhiMoEBlockSparseTop2MLP': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mixtral.PatchedMixtralBLockSparseTop2MLP', + 'modeling_phimoe.PhiMoEBLockSparseTop2MLP': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mixtral.PatchedMixtralBLockSparseTop2MLP', + 'modeling_phimoe.PhiMoESparseMoeBlock': + f'{LMDEPLOY_PYTORCH_MODEL_PATH}.phi3_moe.PatchedPhiMoESparseMoeBlock', +}) diff --git a/lmdeploy/pytorch/models/phi3_moe.py b/lmdeploy/pytorch/models/phi3_moe.py new file mode 100644 index 0000000000..6fb07272d4 --- /dev/null +++ b/lmdeploy/pytorch/models/phi3_moe.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from torch import distributed as dist +from torch import nn + +from lmdeploy.pytorch.kernels.fused_moe import fused_moe + + +# from https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/blob/482a9ba0eb0e1fa1671e3560e009d7cec2e5147c/modeling_phimoe.py#L883 # noqa: E501 +def sparsemixer(scores, top_k, jitter_eps): + assert top_k == 2 + final_multipliers = scores.new_empty((scores.shape[0], top_k)) + final_experts = torch.empty_like(final_multipliers) + with torch.no_grad(): + # compute mask for sparsity + mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) + factor = scores.abs().clamp(min=mask_logits_threshold) + mask_logits_threshold = ( + (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + + # apply mask + masked_gates = scores.masked_fill(mask_logits_threshold, float('-inf')) + selected_experts = max_ind + + final_experts[:, 0:1] = max_ind + # compute scores for gradients + masked_gates = torch.softmax(masked_gates, dim=-1) + final_multipliers[:, 0:1] = masked_gates.gather(dim=-1, + index=selected_experts) + # masked out first expert + masked_scores = torch.scatter( + scores, + -1, + selected_experts, + float('-inf'), + ) + with torch.no_grad(): + # compute mask for sparsity + mask_logits_threshold, max_ind = masked_scores.max(dim=-1, + keepdim=True) + factor = scores.abs().clamp(min=mask_logits_threshold) + mask_logits_threshold = ( + (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + + final_experts[:, 1:2] = max_ind + # apply mask + masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, + float('-inf')) + selected_experts_top2 = max_ind + # compute scores for gradients + masked_gates_top2 = torch.softmax(masked_gates_top2, dim=-1) + final_multipliers[:, 1:2] = masked_gates_top2.gather( + dim=-1, index=selected_experts_top2) + return final_multipliers, final_experts + + +class PatchedPhiMoESparseMoeBlock(nn.Module): + + def _update_model_fn(self): + """update model.""" + num_experts = self.num_experts + + def __get_meta(): + exp = self.experts[0] + ffn_dim = exp.w1.weight.size(0) + hidden_dim = exp.w2.weight.size(0) + dtype = exp.w1.weight.dtype + device = exp.w1.weight.device + return ffn_dim, hidden_dim, dtype, device + + def __copy_assign_param(param, weight): + """copy assign.""" + weight.copy_(param.data) + param.data = weight + + ffn_dim, hidden_dim, dtype, device = __get_meta() + + gate_up_weights = torch.empty(num_experts, + ffn_dim * 2, + hidden_dim, + device=device, + dtype=dtype) + down_weights = torch.empty(num_experts, + hidden_dim, + ffn_dim, + device=device, + dtype=dtype) + for exp_id, exp in enumerate(self.experts): + __copy_assign_param(exp.w1.weight, + gate_up_weights[exp_id, :ffn_dim]) + __copy_assign_param(exp.w3.weight, gate_up_weights[exp_id, + ffn_dim:]) + __copy_assign_param(exp.w2.weight, down_weights[exp_id]) + + torch.cuda.empty_cache() + + self.register_buffer('gate_up_weights', gate_up_weights) + self.register_buffer('down_weights', down_weights) + + @classmethod + def _distribute_output_fn(cls, outputs, **kwargs): + """Distribution output hook.""" + dist.all_reduce(outputs[0]) + return outputs + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """rewrite moe forward.""" + + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + router_logits = self.gate(hidden_states) + routing_weights, selected_experts = sparsemixer( + router_logits, + top_k=2, + jitter_eps=self.router_jitter_noise, + ) + out_states = 
fused_moe(hidden_states, + self.gate_up_weights, + self.down_weights, + routing_weights, + selected_experts, + topk=2, + renormalize=False) + + out_states = out_states.reshape(batch_size, sequence_length, -1) + return out_states, router_logits diff --git a/lmdeploy/pytorch/supported_models.py b/lmdeploy/pytorch/supported_models.py index ad9eaa84ff..09ded26746 100644 --- a/lmdeploy/pytorch/supported_models.py +++ b/lmdeploy/pytorch/supported_models.py @@ -62,6 +62,8 @@ InternVLChatModel=True, # gemma2 Gemma2ForCausalLM=True, + # phi3.5-moe + PhiMoEForCausalLM=True, ) diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py index 4aa0ca43fe..a38971e4d0 100644 --- a/tests/test_lmdeploy/test_model.py +++ b/tests/test_lmdeploy/test_model.py @@ -499,3 +499,37 @@ def test_codegeex4(): ref = tokenizer.apply_chat_template(messages, tokenize=False) res = model.messages2prompt(messages) assert res.startswith(ref) + + +@pytest.mark.parametrize('model_path_and_name', [ + 'microsoft/Phi-3-mini-128k-instruct', + 'microsoft/Phi-3-vision-128k-instruct', + 'microsoft/Phi-3.5-mini-instruct', + 'microsoft/Phi-3.5-vision-instruct', + 'microsoft/Phi-3.5-MoE-instruct', +]) +def test_phi3(model_path_and_name): + deduced_name = best_match_model(model_path_and_name) + assert deduced_name == 'phi-3' + model = MODELS.get(deduced_name)() + messages = [{ + 'role': 'system', + 'content': 'you are a helpful assistant' + }, { + 'role': 'user', + 'content': 'who are you' + }, { + 'role': 'assistant', + 'content': 'I am an AI' + }, { + 'role': 'user', + 'content': 'AGI is?' + }] + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path_and_name, + trust_remote_code=True) + ref = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + res = model.messages2prompt(messages) + assert res.startswith(ref) From ada7285ca62b10cea40125137a36450a5f8226c7 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Fri, 23 Aug 2024 21:05:58 +0800 Subject: [PATCH 39/39] Fix the logic of update engine_config to TurbomindModelConfig for both tm model and hf model (#2362) * Fix the logic of update engine_config to TurbomindModelConfig for both tm model and hf model * update group_size * fix --- lmdeploy/cli/utils.py | 2 +- lmdeploy/messages.py | 4 ++-- lmdeploy/turbomind/chat.py | 7 +++---- lmdeploy/turbomind/deploy/converter.py | 18 ++++++++++++++++-- lmdeploy/turbomind/turbomind.py | 4 ++++ 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index f25be5ba43..e1f5c41ab2 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -182,7 +182,7 @@ def rope_scaling_factor(parser): return parser.add_argument('--rope-scaling-factor', type=float, - default=None, + default=0.0, help='Rope scaling factor') @staticmethod diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 89ddfb6a23..865c2249de 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -151,7 +151,7 @@ class TurbomindEngineConfig: cache_block_seq_len: int = 64 enable_prefix_caching: bool = False quant_policy: int = 0 - rope_scaling_factor: float = None + rope_scaling_factor: float = 0.0 use_logn_attn: bool = False download_dir: Optional[str] = None revision: Optional[str] = None @@ -165,7 +165,7 @@ def __post_init__(self): assert self.max_batch_size >= 1, 'max_batch_size must be a positive integer' # noqa assert self.cache_max_entry_count > 0 and self.cache_max_entry_count < 1, 'invalid cache_max_entry_count' # noqa 
assert self.quant_policy in (0, 4, 8), 'invalid quant_policy' - assert self.rope_scaling_factor is None or self.rope_scaling_factor > 0, 'invalid rope_scaling_factor' # noqa + assert self.rope_scaling_factor >= 0, 'invalid rope_scaling_factor' assert self.max_prefill_token_num >= 0, 'invalid max_prefill_token_num' assert self.num_tokens_per_iter >= 0, 'invalid num_tokens_per_iter' diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index 7d06c95a06..ba488b77a4 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -40,7 +40,7 @@ def main(model_path: str, quant_policy: int = 0, cache_max_entry_count: float = 0.8, cache_block_seq_len: int = 64, - rope_scaling_factor: float = None, + rope_scaling_factor: float = 0.0, enable_prefix_caching: bool = False, session_len: int = None, stream_output: bool = True, @@ -63,7 +63,7 @@ def main(model_path: str, quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4 cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache. cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64 - rope_scaling_factor (float): scaling factor used for dynamic ntk, default to None. TurboMind follows the implementation of transformer LlamaAttention + rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention enable_prefix_caching (bool): whether enable prefix caching session_len (int): the length input output tokens stream_output (bool): indicator for streaming output or not @@ -95,9 +95,8 @@ def main(model_path: str, cache_block_seq_len=cache_block_seq_len, enable_prefix_caching=enable_prefix_caching, quant_policy=quant_policy, + rope_scaling_factor=rope_scaling_factor, tp=tp) - if rope_scaling_factor: - engine_cfg.rope_scaling_factor = rope_scaling_factor print('engine_cfg:\n', engine_cfg, sep='', flush=True) from lmdeploy import turbomind as tm diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index e9b67a3b66..441b3cbe22 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -228,6 +228,10 @@ def get_tm_model(model_path, else: assert 0, f'unsupported quant_config: {quant_config}' + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) + if not group_size: + group_size = 128 + if engine_config.model_format in ['awq', 'gptq']: assert group_size == 128, \ f'model format is "{engine_config.model_format}" ' \ @@ -249,14 +253,24 @@ def get_tm_model(model_path, cfg.chat_template = chat_template_name cfg.model_name = model_name - cfg.update_from_engine_config(engine_config) + cfg.tensor_para_size = engine_config.tp output_model = OUTPUT_MODELS.get(output_model_name)( input_model=input_model, cfg=cfg, exporter_factory=exporter_factory, out_dir=out_dir) - + if engine_config.rope_scaling_factor == 0: + # to avoid `rope_scaling_factor` from engine_config override + # the rope_scaling_factor in TurbomindModelConfig + engine_config.rope_scaling_factor = None + output_model.cfg.update_from_engine_config(engine_config) + # cast bool to int, otherwise, the bool variables will be saved to + # config.ini as string + # TODO(lvhan): change config.ini to config.yaml + output_model.cfg.enable_prefix_caching = int( + output_model.cfg.enable_prefix_caching) + output_model.cfg.use_logn_attn = int(output_model.cfg.use_logn_attn) return output_model diff --git 
a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 1cd4ef47c9..e1ab172bf1 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -230,6 +230,10 @@ def _from_workspace(self, model_path: str, if engine_config is not None: engine_config.tp = cfg.tensor_para_size + if engine_config.rope_scaling_factor == 0: + # to avoid `rope_scaling_factor` from engine_config override + # the rope_scaling_factor in TurbomindModelConfig + engine_config.rope_scaling_factor = None cfg.update_from_engine_config(engine_config) if self.model_name: cfg.model_name = self.model_name
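
To make the effect of this last change concrete, here is a minimal usage sketch (not part of the patch; the model id, scaling factor, and session length are placeholder values): with the new default of `rope_scaling_factor = 0.0`, the factor recorded in the converted TurboMind model config is preserved, and only an explicitly positive value passed through `TurbomindEngineConfig` overrides it.

```python
from lmdeploy import pipeline, TurbomindEngineConfig

# Leaving rope_scaling_factor at its default of 0.0 keeps the value from the
# model's own TurboMind config; a positive value overrides it (dynamic NTK).
engine_config = TurbomindEngineConfig(
    rope_scaling_factor=2.5,  # placeholder override for long-context use
    session_len=160000,       # placeholder session length
)

pipe = pipeline('internlm/internlm2_5-7b-chat',  # placeholder model id
                backend_config=engine_config)
print(pipe(['Summarize this document: ...']))
```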
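
Likewise, the `seed` request field added to the OpenAI-compatible server earlier in this series can be exercised from a standard OpenAI client. A minimal sketch, assuming an `api_server` from this series is already running on the default port:

```python
from openai import OpenAI

client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': 'Pick a number between 1 and 100'}],
    seed=42,  # forwarded to GenerationConfig.random_seed on the server side
    temperature=0.8,
    top_p=0.8)
print(response.choices[0].message.content)
```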