diff --git a/.github/workflows/cuda11.8-whl-release.yml b/.github/workflows/cuda11.8-whl-release.yml index a762e32b74..b167b33c38 100644 --- a/.github/workflows/cuda11.8-whl-release.yml +++ b/.github/workflows/cuda11.8-whl-release.yml @@ -46,11 +46,12 @@ jobs: sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: builder/manywheel/${{ env.OUTPUT_FOLDER }}/* retention-days: 1 + name: linux-${{ matrix.pyver }} windows-build: strategy: @@ -89,11 +90,12 @@ jobs: rm build -Force -Recurse python setup.py bdist_wheel -d build/wheel - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: build/wheel/* retention-days: 1 + name: windows-${{ matrix.pyver }} publish: runs-on: ubuntu-latest @@ -105,11 +107,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Download artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 + with: + path: artifact + merge-multiple: true - name: Add cuda version to package name run: | ver=$(cat lmdeploy/version.py | grep '__version__ =' | cut -d\' -f2) cuver=$ver+cu118 + ls -lh cd artifact for file in *; do mv "$file" "`echo $file | sed "s/$ver/$cuver/g"`"; diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index e7e03d44f9..bd7d6c259f 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -32,7 +32,7 @@ on: required: true description: 'Dependency packages, you can also set a specific version' type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord' + default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq' tools_regression: required: true description: 'Whether start a tool regression' @@ -58,11 +58,12 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord'}} + dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq'}} HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PATH: /opt/py3/lib/python3.10/site-packages/lmdeploy jobs: linux-build: @@ -170,81 +171,105 @@ jobs: continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: 
true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - convert continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert')) run: | - pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} 
|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline turbomind vl continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest 
autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful turbomind vl continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and 
not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.model), 'local_case') run: | - pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | @@ -323,7 +348,8 @@ jobs: - name: Test lmdeploy - restful api timeout-minutes: 75 run: | - pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() run: | @@ -343,7 +369,8 @@ jobs: - name: Test lmdeploy - restful api - base timeout-minutes: 40 run: | - pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() run: | @@ -409,10 +436,14 @@ jobs: rm -rf allure-results - name: Test lmdeploy - interface pipeline case run: | - pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | @@ -476,12 +507,13 @@ jobs: lmdeploy check_env - name: Test benchmark script run: | - pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function --lf --alluredir=${{env.REPORT_DIR}} + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function --lf --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | chmod -R 777 $REPORT_DIR - chmod -R 777 /nvme/qa_test_models/benchmark_reports/${{ github.run_id }} + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} export workdir=$(pwd) cd .. rm -rf $workdir @@ -495,7 +527,7 @@ jobs: timeout-minutes: 5 runs-on: [self-hosted, linux-a100] env: - BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark_reports/${{ github.run_id }} + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} steps: - name: Clone repository uses: actions/checkout@v3 @@ -507,17 +539,69 @@ jobs: pip install pandas fire mmengine python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + needs: [test_tools, test_restful, test_pipeline, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: | + python3 -m pip install lmdeploy-*.whl + python3 -m pip install -r requirements/test.txt + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} + run: | + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + notify_to_feishu: if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') - needs: [test_tools, test_restful, test_pipeline, get_benchmark_result] + needs: [get_benchmark_result, get_coverage_report] timeout-minutes: 5 runs-on: [self-hosted, linux-a100] steps: - - name: fail notify + - name: notify if: contains(needs.*.result, 'failure') run: | - curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test failed!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. 
"},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} - - name: success notify - if: needs.test_tools.result=='success' && needs.test_restful.result=='success' && needs.test_pipeline.result=='success' - run: | - curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test success","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test finished!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 64e482c2c8..5b17fccb22 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -113,13 +113,13 @@ jobs: - name: Install pytorch run: | python3 -m pip cache dir - python3 -m pip install torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu118 + python3 -m pip install torch==2.3.0 torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu118 - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-2.5.7+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install -U 'xformers<=0.0.26' --index-url https://download.pytorch.org/whl/cu118 + python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy run: | diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 7c0ae8a24d..3a19ebe870 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -45,7 +45,8 @@ jobs: steps: - name: Setup systems run: | - rm /etc/apt/sources.list.d/cuda*.list && apt-get update -y && apt-get install -y software-properties-common wget vim &&\ + apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\ + curl https://sh.rustup.rs -sSf | sh -s -- -y &&\ add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \ ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3 diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index bcb992422f..4361e17dd4 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -45,11 +45,12 @@ jobs: sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: builder/manywheel/${{ env.OUTPUT_FOLDER }}/* retention-days: 1 + name: linux-${{ matrix.pyver }} windows-build: strategy: @@ -90,11 +91,12 @@ jobs: rm build -Force -Recurse python setup.py bdist_wheel -d build/wheel - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: build/wheel/* retention-days: 1 + name: windows-${{ matrix.pyver }} publish: runs-on: ubuntu-latest @@ -104,7 +106,10 @@ jobs: - windows-build steps: - name: Download artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 + with: + path: artifact + merge-multiple: true - name: Display artifacts run: ls artifact/ -lh - name: Set up python3.8 diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index fb68b5c37b..c946177c0e 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -78,7 +78,7 @@ jobs: strategy: fail-fast: false matrix: - model: ['internlm/internlm2-chat-20b'] + model: ['internlm/internlm2_5-20b-chat'] container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 -e NO_PROXY=localhost,127.0.0.1 -e no_proxy=localhost,127.0.0.1 --pull never" @@ -134,10 +134,10 @@ jobs: sleep 120s - name: Test lmdeploy - restful api run: | - python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log - python3 
/nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-3.log + python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv &> ${{env.REPORT_DIR}}/stable.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv &> ${{env.REPORT_DIR}}/stable-internal-1.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv &> ${{env.REPORT_DIR}}/stable-internal-2.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv &> ${{env.REPORT_DIR}}/stable-internal-3.log - name: Kill api server if: always() run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c6c0a45bf3..62f19298d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,8 @@ repos: rev: v2.1.0 hooks: - id: codespell - args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"] + args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"] + - repo: https://github.com/myint/docformatter rev: v1.4 diff --git a/3rdparty/INIReader.h b/3rdparty/INIReader.h deleted file mode 100644 index 6ed9b5a5aa..0000000000 --- a/3rdparty/INIReader.h +++ /dev/null @@ -1,501 +0,0 @@ -// Read an INI file into easy-to-access name/value pairs. - -// inih and INIReader are released under the New BSD license. -// Go to the project home page for more info: -// -// https://github.com/benhoyt/inih (Initial repo) -// https://github.com/jtilly/inih (The reference of this header file) -/* inih -- simple .INI file parser -inih is released under the New BSD license (see LICENSE.txt). 
Go to the project -home page for more info: -https://github.com/benhoyt/inih -https://github.com/jtilly/inih -*/ - -#ifndef __INI_H__ -#define __INI_H__ - -/* Make this header file easier to include in C++ code */ -#ifdef __cplusplus -extern "C" { -#endif - -#include - -/* Typedef for prototype of handler function. */ -typedef int (*ini_handler)(void* user, const char* section, - const char* name, const char* value); - -/* Typedef for prototype of fgets-style reader function. */ -typedef char* (*ini_reader)(char* str, int num, void* stream); - -/* Parse given INI-style file. May have [section]s, name=value pairs - (whitespace stripped), and comments starting with ';' (semicolon). Section - is "" if name=value pair parsed before any section heading. name:value - pairs are also supported as a concession to Python's configparser. - For each name=value pair parsed, call handler function with given user - pointer as well as section, name, and value (data only valid for duration - of handler call). Handler should return nonzero on success, zero on error. - Returns 0 on success, line number of first error on parse error (doesn't - stop on first error), -1 on file open error, or -2 on memory allocation - error (only when INI_USE_STACK is zero). -*/ -int ini_parse(const char* filename, ini_handler handler, void* user); - -/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't - close the file when it's finished -- the caller must do that. */ -int ini_parse_file(FILE* file, ini_handler handler, void* user); - -/* Same as ini_parse(), but takes an ini_reader function pointer instead of - filename. Used for implementing custom or string-based I/O. */ -int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, - void* user); - -/* Nonzero to allow multi-line value parsing, in the style of Python's - configparser. If allowed, ini_parse() will call the handler with the same - name for each subsequent line parsed. */ -#ifndef INI_ALLOW_MULTILINE -#define INI_ALLOW_MULTILINE 1 -#endif - -/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of - the file. See http://code.google.com/p/inih/issues/detail?id=21 */ -#ifndef INI_ALLOW_BOM -#define INI_ALLOW_BOM 1 -#endif - -/* Nonzero to allow inline comments (with valid inline comment characters - specified by INI_INLINE_COMMENT_PREFIXES). Set to 0 to turn off and match - Python 3.2+ configparser behaviour. */ -#ifndef INI_ALLOW_INLINE_COMMENTS -#define INI_ALLOW_INLINE_COMMENTS 1 -#endif -#ifndef INI_INLINE_COMMENT_PREFIXES -#define INI_INLINE_COMMENT_PREFIXES ";" -#endif - -/* Nonzero to use stack, zero to use heap (malloc/free). */ -#ifndef INI_USE_STACK -#define INI_USE_STACK 1 -#endif - -/* Stop parsing on first error (default is to keep parsing). */ -#ifndef INI_STOP_ON_FIRST_ERROR -#define INI_STOP_ON_FIRST_ERROR 0 -#endif - -/* Maximum line length for any line in INI file. */ -#ifndef INI_MAX_LINE -#define INI_MAX_LINE 200 -#endif - -#ifdef __cplusplus -} -#endif - -/* inih -- simple .INI file parser -inih is released under the New BSD license (see LICENSE.txt). Go to the project -home page for more info: -https://github.com/benhoyt/inih -*/ - -#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include -#include -#include - -#if !INI_USE_STACK -#include -#endif - -#define MAX_SECTION 50 -#define MAX_NAME 50 - -/* Strip whitespace chars off end of given string, in place. Return s. 
*/ -inline static char* rstrip(char* s) -{ - char* p = s + strlen(s); - while (p > s && isspace((unsigned char)(*--p))) - *p = '\0'; - return s; -} - -/* Return pointer to first non-whitespace char in given string. */ -inline static char* lskip(const char* s) -{ - while (*s && isspace((unsigned char)(*s))) - s++; - return (char*)s; -} - -/* Return pointer to first char (of chars) or inline comment in given string, - or pointer to null at end of string if neither found. Inline comment must - be prefixed by a whitespace character to register as a comment. */ -inline static char* find_chars_or_comment(const char* s, const char* chars) -{ -#if INI_ALLOW_INLINE_COMMENTS - int was_space = 0; - while (*s && (!chars || !strchr(chars, *s)) && - !(was_space && strchr(INI_INLINE_COMMENT_PREFIXES, *s))) { - was_space = isspace((unsigned char)(*s)); - s++; - } -#else - while (*s && (!chars || !strchr(chars, *s))) { - s++; - } -#endif - return (char*)s; -} - -/* Version of strncpy that ensures dest (size bytes) is null-terminated. */ -inline static char* strncpy0(char* dest, const char* src, size_t size) -{ - strncpy(dest, src, size); - dest[size - 1] = '\0'; - return dest; -} - -/* See documentation in header file. */ -inline int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, - void* user) -{ - /* Uses a fair bit of stack (use heap instead if you need to) */ -#if INI_USE_STACK - char line[INI_MAX_LINE]; -#else - char* line; -#endif - char section[MAX_SECTION] = ""; - char prev_name[MAX_NAME] = ""; - - char* start; - char* end; - char* name; - char* value; - int lineno = 0; - int error = 0; - -#if !INI_USE_STACK - line = (char*)malloc(INI_MAX_LINE); - if (!line) { - return -2; - } -#endif - - /* Scan through stream line by line */ - while (reader(line, INI_MAX_LINE, stream) != NULL) { - lineno++; - - start = line; -#if INI_ALLOW_BOM - if (lineno == 1 && (unsigned char)start[0] == 0xEF && - (unsigned char)start[1] == 0xBB && - (unsigned char)start[2] == 0xBF) { - start += 3; - } -#endif - start = lskip(rstrip(start)); - - if (*start == ';' || *start == '#') { - /* Per Python configparser, allow both ; and # comments at the - start of a line */ - } -#if INI_ALLOW_MULTILINE - else if (*prev_name && *start && start > line) { - -#if INI_ALLOW_INLINE_COMMENTS - end = find_chars_or_comment(start, NULL); - if (*end) - *end = '\0'; - rstrip(start); -#endif - - /* Non-blank line with leading whitespace, treat as continuation - of previous name's value (as per Python configparser). 
*/ - if (!handler(user, section, prev_name, start) && !error) - error = lineno; - } -#endif - else if (*start == '[') { - /* A "[section]" line */ - end = find_chars_or_comment(start + 1, "]"); - if (*end == ']') { - *end = '\0'; - strncpy0(section, start + 1, sizeof(section)); - *prev_name = '\0'; - } - else if (!error) { - /* No ']' found on section line */ - error = lineno; - } - } - else if (*start) { - /* Not a comment, must be a name[=:]value pair */ - end = find_chars_or_comment(start, "=:"); - if (*end == '=' || *end == ':') { - *end = '\0'; - name = rstrip(start); - value = lskip(end + 1); -#if INI_ALLOW_INLINE_COMMENTS - end = find_chars_or_comment(value, NULL); - if (*end) - *end = '\0'; -#endif - rstrip(value); - - /* Valid name[=:]value pair found, call handler */ - strncpy0(prev_name, name, sizeof(prev_name)); - if (!handler(user, section, name, value) && !error) - error = lineno; - } - else if (!error) { - /* No '=' or ':' found on name[=:]value line */ - error = lineno; - } - } - -#if INI_STOP_ON_FIRST_ERROR - if (error) - break; -#endif - } - -#if !INI_USE_STACK - free(line); -#endif - - return error; -} - -/* See documentation in header file. */ -inline int ini_parse_file(FILE* file, ini_handler handler, void* user) -{ - return ini_parse_stream((ini_reader)fgets, file, handler, user); -} - -/* See documentation in header file. */ -inline int ini_parse(const char* filename, ini_handler handler, void* user) -{ - FILE* file; - int error; - - file = fopen(filename, "r"); - if (!file) - return -1; - error = ini_parse_file(file, handler, user); - fclose(file); - return error; -} - -#endif /* __INI_H__ */ - - -#ifndef __INIREADER_H__ -#define __INIREADER_H__ - -#include -#include -#include - -// Read an INI file into easy-to-access name/value pairs. (Note that I've gone -// for simplicity here rather than speed, but it should be pretty decent.) -class INIReader -{ -public: - // Empty Constructor - INIReader() {}; - - // Construct INIReader and parse given filename. See ini.h for more info - // about the parsing. - INIReader(std::string filename); - - // Construct INIReader and parse given file. See ini.h for more info - // about the parsing. - INIReader(FILE *file); - ~INIReader(); - // Return the result of ini_parse(), i.e., 0 on success, line number of - // first error on parse error, or -1 on file open error. - int ParseError() const; - - // Return the list of sections found in ini file - const std::set& Sections() const; - - // Get a string value from INI file, returning default_value if not found. - std::string Get(std::string section, std::string name, - std::string default_value) const; - std::string Get(std::string section, std::string name) const; - - // Get an integer (long) value from INI file, returning default_value if - // not found or not a valid integer (decimal "1234", "-1234", or hex "0x4d2"). - long GetInteger(std::string section, std::string name, long default_value) const; - long GetInteger(std::string section, std::string name) const; - - // Get a real (floating point double) value from INI file, returning - // default_value if not found or not a valid floating point value - // according to strtod(). - double GetReal(std::string section, std::string name, double default_value) const; - - // Get a single precision floating point number value from INI file, returning - // default_value if not found or not a valid floating point value - // according to strtof(). 
- float GetFloat(std::string section, std::string name, float default_value) const; - float GetFloat(std::string section, std::string name) const; - - // Get a boolean value from INI file, returning default_value if not found or if - // not a valid true/false value. Valid true values are "true", "yes", "on", "1", - // and valid false values are "false", "no", "off", "0" (not case sensitive). - bool GetBoolean(std::string section, std::string name, bool default_value) const; - -protected: - int _error; - std::map _values; - std::set _sections; - static std::string MakeKey(std::string section, std::string name); - static int ValueHandler(void* user, const char* section, const char* name, - const char* value); -}; - -#endif // __INIREADER_H__ - - -#ifndef __INIREADER__ -#define __INIREADER__ - -#include -#include -#include - -inline INIReader::INIReader(std::string filename) -{ - _error = ini_parse(filename.c_str(), ValueHandler, this); -} - -inline INIReader::INIReader(FILE *file) -{ - _error = ini_parse_file(file, ValueHandler, this); -} - -inline int INIReader::ParseError() const -{ - return _error; -} - -inline INIReader::~INIReader() { } - -inline const std::set& INIReader::Sections() const -{ - return _sections; -} - -inline std::string INIReader::Get(std::string section, std::string name, std::string default_value) const -{ - std::string key = MakeKey(section, name); - return _values.count(key) ? _values.at(key) : default_value; -} - -inline std::string INIReader::Get(std::string section, std::string name) const -{ - std::string key = MakeKey(section, name); - if(_values.count(key)) return _values.at(key); - else - { - printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str()); - exit(-1); - } -} - -inline long INIReader::GetInteger(std::string section, std::string name, long default_value) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - // This parses "1234" (decimal) and also "0x4D2" (hex) - long n = strtol(value, &end, 0); - return end > value ? n : default_value; -} - -inline long INIReader::GetInteger(std::string section, std::string name) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - // This parses "1234" (decimal) and also "0x4D2" (hex) - long n = strtol(value, &end, 0); - if(end <= value) - { - printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str()); - exit(-1); - } - return n; -} - -inline double INIReader::GetReal(std::string section, std::string name, double default_value) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - double n = strtod(value, &end); - return end > value ? n : default_value; -} - -inline float INIReader::GetFloat(std::string section, std::string name, float default_value) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - float n = strtof(value, &end); - return end > value ? n : default_value; -} - -inline float INIReader::GetFloat(std::string section, std::string name) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - float n = strtof(value, &end); - if(end <= value) - { - printf("[ERROR] Does not find the section %s with name %s. 
\n", section.c_str(), name.c_str()); - exit(-1); - } - return n; -} - -inline bool INIReader::GetBoolean(std::string section, std::string name, bool default_value) const -{ - std::string valstr = Get(section, name, ""); - // Convert to lower case to make string comparisons case-insensitive - std::transform(valstr.begin(), valstr.end(), valstr.begin(), ::tolower); - if (valstr == "true" || valstr == "yes" || valstr == "on" || valstr == "1") - return true; - else if (valstr == "false" || valstr == "no" || valstr == "off" || valstr == "0") - return false; - else - return default_value; -} - -inline std::string INIReader::MakeKey(std::string section, std::string name) -{ - std::string key = section + "=" + name; - // Convert to lower case to make section/name lookups case-insensitive - std::transform(key.begin(), key.end(), key.begin(), ::tolower); - return key; -} - -inline int INIReader::ValueHandler(void* user, const char* section, const char* name, - const char* value) -{ - INIReader* reader = (INIReader*)user; - std::string key = MakeKey(section, name); - if (reader->_values[key].size() > 0) - reader->_values[key] += "\n"; - reader->_values[key] += value; - reader->_sections.insert(section); - return 1; -} - -#endif // __INIREADER__ diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d5abdad22..4e996d1855 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,15 @@ if (BUILD_TEST) set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include) endif() +FetchContent_Declare( + yaml-cpp + GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git + GIT_TAG 0.8.0 +) +set(YAML_BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library of yaml-cpp") +FetchContent_MakeAvailable(yaml-cpp) + + option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) option(BUILD_FAST_MATH "Build in fast math mode" ON) diff --git a/README.md b/README.md index a0db34e369..e26d120a71 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) | -[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started/get_started.html) | [🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) English | [简体中文](README_zh-CN.md) | [日本語](README_ja.md) @@ -180,7 +180,7 @@ pip install lmdeploy ``` The default prebuilt package is compiled on **CUDA 12** since v0.3.0. -For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](./docs/en/installation.md). +For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](docs/en/get_started/installation.md). ## Offline Batch Inference @@ -200,7 +200,7 @@ For more information about inference pipeline, please refer to [here](docs/en/ll # Tutorials -Please review [getting_started](./docs/en/get_started.md) section for the basic usage of LMDeploy. +Please review [getting_started](docs/en/get_started/get_started.md) section for the basic usage of LMDeploy. 
For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/): diff --git a/README_ja.md b/README_ja.md index 94e3eb7b6c..9313397435 100644 --- a/README_ja.md +++ b/README_ja.md @@ -8,7 +8,7 @@ [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) | -[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started/get_started.html) | [🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) [English](README.md) | [简体中文](README_zh-CN.md) | 日本語 @@ -181,7 +181,7 @@ pip install lmdeploy ``` v0.3.0から、デフォルトの事前構築済みパッケージはCUDA 12でコンパイルされています。 -CUDA 11+プラットフォームでのインストールに関する情報、またはソースからのビルド手順については、[インストールガイドを](docs/en/installation.md)参照してください。 +CUDA 11+プラットフォームでのインストールに関する情報、またはソースからのビルド手順については、[インストールガイドを](docs/en/get_started/installation.md)参照してください。 ## オフラインバッチ推論 @@ -201,7 +201,7 @@ print(response) # チュートリアル -LMDeployの基本的な使用方法については、[getting_started](./docs/en/get_started.md)セクションを参照してください。 +LMDeployの基本的な使用方法については、[getting_started](docs/en/get_started/get_started.md)セクションを参照してください。 詳細なユーザーガイドと高度なガイドについては、[チュートリアル](https://lmdeploy.readthedocs.io/en/latest/)を参照してください: diff --git a/README_zh-CN.md b/README_zh-CN.md index 79d551e3e3..7332241676 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -8,7 +8,7 @@ [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [📘Documentation](https://lmdeploy.readthedocs.io/zh-cn/latest/) | -[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started/get_started.html) | [🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) [English](README.md) | 简体中文 | [日本語](README_ja.md) @@ -180,7 +180,7 @@ conda activate lmdeploy pip install lmdeploy ``` -自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,或者源码安装 LMDeploy,请参考[安装文档](./docs/zh_cn/installation.md) +自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,或者源码安装 LMDeploy,请参考[安装文档](docs/zh_cn/get_started/installation.md) ## 离线批处理 @@ -200,7 +200,7 @@ print(response) # 用户教程 -请阅读[快速上手](./docs/zh_cn/get_started.md)章节,了解 LMDeploy 的基本用法。 +请阅读[快速上手](docs/zh_cn/get_started/get_started.md)章节,了解 LMDeploy 的基本用法。 为了帮助用户更进一步了解 LMDeploy,我们准备了用户指南和进阶指南,请阅读我们的[文档](https://lmdeploy.readthedocs.io/zh-cn/latest/): diff --git a/autotest/benchmark/test_apiserver_performance.py b/autotest/benchmark/test_apiserver_performance.py index 5ac0335660..761cf0302b 100644 --- a/autotest/benchmark/test_apiserver_performance.py +++ b/autotest/benchmark/test_apiserver_performance.py @@ -88,13 +88,13 @@ def test_restful_tp4(config, run_id, prepare_environment, worker_id): @pytest.mark.function @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'backend': 'pytorch', 'tp_num': 2, 'extra': '--max-batch-size 256 --cache-max-entry-count 0.9', 'cuda_prefix': None }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': 2, @@ -106,7 +106,8 @@ def test_restful_func_tp2(config, run_id, prepare_environment, worker_id): result, restful_log, msg = 
restful_test(config, run_id, prepare_environment, - worker_id=worker_id) + worker_id=worker_id, + is_smoke=True) if restful_log is not None: allure.attach.file(restful_log, diff --git a/autotest/benchmark/test_generation_performance.py b/autotest/benchmark/test_generation_performance.py index 62a9d53baf..cffdc53270 100644 --- a/autotest/benchmark/test_generation_performance.py +++ b/autotest/benchmark/test_generation_performance.py @@ -117,11 +117,11 @@ def test_generation_longtext_tp4(config, run_id, run_config, worker_id): @pytest.mark.function @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'backend': 'pytorch', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': 2 diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index fe2422c3f9..ad44b22b43 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -62,11 +62,11 @@ def test_throughput_tp4(config, run_id, run_config, worker_id): @pytest.mark.function @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'backend': 'pytorch', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': 2 @@ -77,7 +77,8 @@ def test_throughput_func_tp2(config, run_id, run_config, worker_id): run_id, run_config, cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) + worker_id=worker_id, + is_smoke=True) if throughput_log is not None: allure.attach.file(throughput_log, diff --git a/autotest/config.yaml b/autotest/config.yaml index 71786ffdbb..b7c928909a 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -1,7 +1,7 @@ model_path: /nvme/qa_test_models dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log -benchmark_path: /nvme/qa_test_models/benchmark_reports +benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json tp_config: @@ -16,7 +16,6 @@ tp_config: Meta-Llama-3-1-70B-Instruct: 4 internlm2_5-7b-chat-1m: 4 Qwen2-7B-Instruct-GPTQ-Int4: 2 - InternVL2-40B: 2 turbomind_chat_model: @@ -45,6 +44,7 @@ turbomind_chat_model: - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ - Qwen/Qwen-VL-Chat + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 @@ -104,6 +104,7 @@ turbomind_base_model: pytorch_base_model: - tiiuae/falcon-7b + - internlm/internlm2_5-7b - internlm/internlm2_5-1_8b - internlm/internlm2-20b @@ -160,8 +161,9 @@ turbomind_quatization: - baichuan-inc/Baichuan2-7B-Chat - codellama/CodeLlama-7b-hf - openbmb/MiniCPM-Llama3-V-2_5 + - THUDM/glm-4-9b-chat gptq: - - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - internlm/internlm2_5-7b-chat kvint: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct @@ -205,9 +207,11 @@ pytorch_quatization: - internlm/internlm2_5-20b-chat - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b + - OpenGVLab/InternVL-Chat-V1-5 - 01-ai/Yi-6B-Chat - Qwen/Qwen2-7B-Instruct - 
Qwen/Qwen2-1.5B-Instruct + - microsoft/Phi-3-mini-4k-instruct w8a8: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-2-7b-chat-hf diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index a90e7775f5..9ee793a895 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -17,7 +17,7 @@ TurbomindEngineConfig, pipeline) -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt(config, model, backend, worker_id): @@ -47,7 +47,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt_stream(config, model, backend, worker_id): @@ -79,7 +79,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt(config, model, backend, worker_id): @@ -109,7 +109,7 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt_stream(config, model, backend, worker_id): @@ -141,7 +141,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message(config, model, backend, worker_id): @@ -172,7 +172,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_stream(config, model, backend, worker_id): @@ -204,7 +204,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_batch(config, model, backend, worker_id): @@ -241,7 +241,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, 
PytorchEngineConfig]) def test_return_with_message_batch_stream(config, model, backend, worker_id): @@ -279,7 +279,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs(config, model, backend, worker_id): @@ -309,7 +309,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs_stream(config, model, backend, worker_id): @@ -343,7 +343,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_session_len(config, model, backend, worker_id): @@ -377,7 +377,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_min_new_tokens(config, model, backend, worker_id): @@ -413,7 +413,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_stop_words(config, model, backend, worker_id): @@ -435,7 +435,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): result &= ' and' not in response[ i].text and ' to ' not in response[i].text result &= response[i].finish_reason == 'stop' and response[ - i].generate_token_len < 20 + i].generate_token_len < 50 save_pipeline_common_log(config, file_name, result, response) del pipe torch.cuda.empty_cache() @@ -454,7 +454,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_bad_words(config, model, backend, worker_id): @@ -492,7 +492,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_false(config, model, backend, worker_id): @@ -533,7 +533,7 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) 
+@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_true(config, model, backend, worker_id): @@ -574,7 +574,7 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimum_repetition_penalty(config, model, backend, @@ -590,8 +590,9 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) - result = 'a 上海 is a 上海, ' * 10 in response.text or get_repeat_times( - response.text, 'Shanghai is') > 5 + result = get_repeat_times(response.text, + 'is a name') > 5 or get_repeat_times( + response.text, 'Shanghai is') > 5 save_pipeline_common_log(config, file_name, result, response) del pipe torch.cuda.empty_cache() @@ -610,7 +611,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_repetition_penalty_bigger_than_1(config, model, backend, @@ -644,7 +645,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topp(config, model, backend, worker_id): @@ -676,7 +677,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topk(config, model, backend, worker_id): @@ -711,7 +712,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_diff_random_seed(config, model, backend, worker_id): @@ -747,7 +748,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_same_random_seed(config, model, backend, worker_id): @@ -781,7 +782,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) 
@pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_max_new_tokens(config, model, backend, worker_id): @@ -819,7 +820,7 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_ignore_eos(config, model, backend, worker_id): @@ -856,7 +857,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_input_validation(config, model, backend, worker_id): @@ -896,7 +897,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_validate_turbomind(config, model, backend, worker_id): if 'gw' in worker_id: @@ -936,7 +937,7 @@ def test_backend_config_validate_turbomind(config, model, backend, worker_id): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [PytorchEngineConfig]) def test_backend_config_validate_pytorch(config, model, backend, worker_id): if 'gw' in worker_id: @@ -967,7 +968,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_tp(config, model, backend, worker_id): with pytest.raises(AssertionError, match='tp should be 2\\^n'): diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index 8e1f183905..88b8a2847e 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -19,7 +19,7 @@ @pytest.mark.gpu_num_1 @pytest.mark.parametrize('model', [ 'internlm/internlm2-chat-7b', 'internlm/internlm2_5-7b', - 'internlm/internlm2-chat-1_8b', 'internlm/internlm2-1_8b' + 'internlm/internlm2-chat-1_8b' ]) def test_history_issue_tp1(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) @@ -147,7 +147,6 @@ def passkey_retrival(config, tp=tp_num) else: backend_config = TurbomindEngineConfig(session_len=session_len, - use_logn_attn=True, tp=tp_num) else: if 'internlm2_5' in model and '-1m' in model: diff --git a/autotest/interface/restful/test_restful_completions_v1.py b/autotest/interface/restful/test_restful_completions_v1.py index 3e2ca31664..83e52d83c9 100644 --- a/autotest/interface/restful/test_restful_completions_v1.py +++ b/autotest/interface/restful/test_restful_completions_v1.py @@ -188,3 +188,14 @@ def test_completions_stream_stopwords(self): 
assert output_last.get('choices')[0].get('finish_reason') in [ 'stop', 'length' ] + + def test_batch_prompt_order(self): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + for item in api_client.completions_v1( + model=model_name, + prompt=['你好', '今天天气怎么样', '你是谁', '帮我写一首以梅花为主题的五言律诗', '5+2等于多少'], + max_tokens=200): + assert '天气' in item.get('choices')[1].get('text') + assert '梅' in item.get('choices')[3].get('text') + assert '7' in item.get('choices')[4].get('text') diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 3ee8608604..642f87ec28 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -56,7 +56,7 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) def test_hf_pytorch_chat_pr(config, model, cli_case_config): usercase = 'chat_testcase' result, chat_log, msg = hf_command_line_test( diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 5f5e1fde59..2f13898fec 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -106,9 +106,10 @@ def test_hf_turbomind_base_tp2(config, model, cli_case_config, worker_id): @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_hf_turbomind_chat_pr(config, model, cli_case_config): usercase = 'chat_testcase' diff --git a/autotest/tools/chat/test_command_chat_workspace.py b/autotest/tools/chat/test_command_chat_workspace.py index ee7b2ddc47..a16d4e32f6 100644 --- a/autotest/tools/chat/test_command_chat_workspace.py +++ b/autotest/tools/chat/test_command_chat_workspace.py @@ -97,9 +97,10 @@ def test_workspace_base_tp2(config, cli_case_config, model, worker_id): @pytest.mark.command_chat @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_workspace_chat_pr(config, cli_case_config, model): usercase = 'chat_testcase' result, chat_log, msg = command_line_test( diff --git a/autotest/tools/convert/test_convert.py b/autotest/tools/convert/test_convert.py index a8f0859275..9d194f1ea3 100644 --- a/autotest/tools/convert/test_convert.py +++ b/autotest/tools/convert/test_convert.py @@ -22,9 +22,10 @@ def test_convert(config, model, worker_id): @pytest.mark.convert @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_convert_pr(config, model): convert(config, model, 'CUDA_VISIBLE_DEVICES=5') diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py 
b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py index 778ff73c7e..8f56225ebc 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py @@ -63,7 +63,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) def test_pipeline_chat_pytorch_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'pytorch')) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py index 6373549698..d92af06ecb 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py @@ -100,9 +100,10 @@ def test_pipeline_chat_kvint_tp2(config, common_case_config, model, @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_pipeline_chat_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind')) diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index a4c6b26043..aaaabbc6f3 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -11,7 +11,17 @@ @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): - quantization_awq(config, model + '-inner-4bits', model, + quantization_type = 'awq' + quantization_all(config, model + '-inner-4bits', model, quantization_type, + get_cuda_prefix_by_workerid(worker_id)) + + +@pytest.mark.order(3) +@pytest.mark.timeout(900) +@pytest.mark.parametrize('model', get_quantization_model_list('gptq')) +def test_quantization_gptq(config, model, worker_id): + quantization_type = 'gptq' + quantization_all(config, model + '-inner-gptq', model, quantization_type, get_cuda_prefix_by_workerid(worker_id)) @@ -22,14 +32,15 @@ def test_quantization_awq(config, model, worker_id): @pytest.mark.timeout(900) @pytest.mark.parametrize( 'model, prefix', - [('internlm/internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')]) + [('internlm/internlm2_5-20b-chat', 'CUDA_VISIBLE_DEVICES=5')]) def test_quantization_awq_pr(config, model, prefix): - quantization_awq(config, model + '-inner-4bits', model, prefix) + quantization_type = 'awq' + quantization_all(config, model + '-inner-4bits', model, quantization_type, + prefix) -def quantization_awq(config, quantization_model_name, origin_model_name, - cuda_prefix): - quantization_type = 'awq' +def quantization_all(config, quantization_model_name, origin_model_name, + quantization_type, cuda_prefix): result, msg = quantization(config, quantization_model_name, origin_model_name, quantization_type, cuda_prefix) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind.py b/autotest/tools/restful/test_restful_chat_hf_turbomind.py index 4046cf38c2..c9fade16a4 100644 --- 
a/autotest/tools/restful/test_restful_chat_hf_turbomind.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind.py @@ -117,11 +117,11 @@ def test_restful_chat_kvint_tp2(config, common_case_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }], diff --git a/autotest/tools/restful/test_restful_chat_workspace.py b/autotest/tools/restful/test_restful_chat_workspace.py index 17205a9d95..798a43d7b0 100644 --- a/autotest/tools/restful/test_restful_chat_workspace.py +++ b/autotest/tools/restful/test_restful_chat_workspace.py @@ -69,11 +69,11 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }], diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 12aa260c37..9356f40a3b 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -165,7 +165,7 @@ def restful_test(config, command = f'python3 benchmark/profile_restful_api.py localhost:{port} {model_path} {dataset_path} --stream-output True ' # noqa: F401, E501 if is_smoke: - command += ' --num-prompts 300' + command += ' --num-prompts 200' else: command += ' --num-prompts 2000' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 539280c7ee..1cc556748e 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -13,8 +13,9 @@ def get_turbomind_model_list(tp_num: int = None, for key in quatization_case_config.get('awq'): if key in case_list: case_list.append(key + '-inner-4bits') - if model_type == 'chat_model': - case_list += quatization_case_config.get('gptq') + for key in quatization_case_config.get('gptq'): + if key in case_list: + case_list.append(key + '-inner-gptq') if tp_num is not None: return [ @@ -54,13 +55,11 @@ def get_all_model_list(tp_num: int = None, model_type: str = 'chat_model'): turbomind_quantization_config = config.get('turbomind_quatization') pytorch_quantization_config = config.get('pytorch_quatization') for key in turbomind_quantization_config.get( - 'awq') + pytorch_quantization_config.get('awq'): + 'awq') + pytorch_quantization_config.get( + 'awq') + turbomind_quantization_config.get('gptq'): if key in case_list and key + '-inner-4bits' not in case_list: case_list.append(key + '-inner-4bits') - if model_type == 'chat_model': - case_list += turbomind_quantization_config.get('gptq') - if tp_num is not None: return [ item for item in case_list if get_tp_num(config, item) == tp_num @@ -85,6 +84,9 @@ def get_kvint_model_list(tp_num: int = None, model_type: str = 'chat_model'): for key in config.get('turbomind_quatization').get('awq'): if key in case_list_base and key in case_list: case_list.append(key + '-inner-4bits') + for key in config.get('turbomind_quatization').get('gptq'): + if key in case_list_base and key in 
case_list: + case_list.append(key + '-inner-gptq') if tp_num is not None: return [ @@ -104,6 +106,8 @@ def get_quantization_model_list(type): return case_list if type == 'kvint': return config.get('turbomind_quatization').get(type) + if type == 'gptq': + return config.get('turbomind_quatization').get(type) if type == 'w8a8': return config.get('pytorch_quatization').get(type) return [] diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 153a854bb7..75b7319aeb 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -23,13 +23,18 @@ def quantization(config, cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path, '--batch-size 32' ]) + elif quantization_type == 'gptq': + quantization_cmd = ' '.join([ + cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, + '--work-dir', quantization_model_path, '--batch-size 32' + ]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path, '--batch-size 32' ]) else: - return False, 'quantization type should in [awq, w8a8], \ + return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type if 'llama-3' in origin_model_name.lower(): diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index b6dd6a5f48..edc2268e30 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -104,7 +104,7 @@ def command_test(config, file.writelines('reproduce command chat: ' + ' '.join(cmd) + '\n') spliter = '\n\n' - if 'codellama' in model.lower() and ' chat ' in cmd: + if 'codellama' in model.lower() and 'serve' not in ' '.join(cmd): spliter = '\n!!\n' # join prompt together prompt = '' diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index 81de3dbf45..89c07fb196 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -16,7 +16,7 @@ from tqdm import tqdm from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) from lmdeploy.utils import get_logger @@ -25,7 +25,7 @@ def infer(model, session_id: int, input_ids: List, - gen_config: EngineGenerationConfig, test_round: int, que: Queue): + gen_config: GenerationConfig, test_round: int, que: Queue): if session_id == 1: pbar = tqdm(total=test_round) chatbot = model.create_instance() @@ -73,7 +73,7 @@ def infer(model, session_id: int, input_ids: List, def warmup(model, concurrency: int, input_ids: List[int], warmup_round: int, - gen_config: EngineGenerationConfig): + gen_config: GenerationConfig): if not warmup_round: return @@ -110,7 +110,7 @@ def _infer(model, session_id): def profile_throughput(model_path: str, concurrency: int, input_seqlen: int, engine_config: Union[PytorchEngineConfig, TurbomindEngineConfig], - gen_config: EngineGenerationConfig, test_round: int, + gen_config: GenerationConfig, test_round: int, warmup_round: int): output_seqlen = gen_config.max_new_tokens print(f'profiling ... 
concurrency: {concurrency}, ' @@ -424,12 +424,11 @@ def main(): thread_safe=True, enable_prefix_caching=args.enable_prefix_caching, ) - gen_config = EngineGenerationConfig( - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, - max_new_tokens=completion_tokens, - ignore_eos=True) + gen_config = GenerationConfig(top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + max_new_tokens=completion_tokens, + ignore_eos=True) profile_target = partial( profile_throughput, concurrency=batch, diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index be7c1035e8..23fa317810 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -13,7 +13,7 @@ from tqdm import tqdm from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) from lmdeploy.pytorch.engine import EngineInstance from lmdeploy.tokenizer import DetokenizeState, Tokenizer @@ -105,12 +105,11 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, for outputs in model_inst.stream_infer( session_id, input_ids=input_ids, - gen_config=EngineGenerationConfig( - max_new_tokens=output_seqlen, - temperature=temperature, - top_p=top_p, - top_k=top_k, - ignore_eos=True), + gen_config=GenerationConfig(max_new_tokens=output_seqlen, + temperature=temperature, + top_p=top_p, + top_k=top_k, + ignore_eos=True), sequence_start=True, sequence_end=True, stream_output=stream_output): diff --git a/docker/Dockerfile_aarch64_ascend b/docker/Dockerfile_aarch64_ascend new file mode 100644 index 0000000000..058ec6a905 --- /dev/null +++ b/docker/Dockerfile_aarch64_ascend @@ -0,0 +1,171 @@ +FROM ubuntu:20.04 as export_image + +WORKDIR /tmp + +ARG http_proxy +ARG https_proxy +ARG PYVERSION=3.10.5 +ARG DEBIAN_FRONTEND=noninteractive +ARG CHIP=all +ARG ASCEND_BASE=/usr/local/Ascend +ARG TOOLKIT_PKG=Ascend-cann-toolkit_*.run +ARG KERNELS_PKG=Ascend-cann-kernels-*.run +ARG TOOLKIT_PATH=$ASCEND_BASE/ascend-toolkit/latest +ARG DEEPLINK_TAG_OR_COMMIT=6012186b03cff6eac6587e7a06dbaa590af6d5df +ARG DEEPLINKEXT_TAG_OR_COMMIT=525678f2c4c227e1e8bf358259a19a578b67bc37 +ARG LMDEPLOY_TAG_OR_COMMIT=v0.6.0a0 + +RUN sed -i 's@http://.*.ubuntu.com@http://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list && \ + apt update && \ + apt install --no-install-recommends ca-certificates -y && \ + apt install --no-install-recommends bc wget -y && \ + apt install --no-install-recommends curl gcc make g++ pkg-config unzip -y && \ + apt install --no-install-recommends libsqlite3-dev libblas3 liblapack3 gfortran vim -y && \ + apt install --no-install-recommends liblapack-dev libblas-dev libhdf5-dev libffi-dev -y && \ + apt install --no-install-recommends libssl-dev zlib1g-dev xz-utils cython3 python3-h5py -y && \ + apt install --no-install-recommends libopenblas-dev libgmpxx4ldbl liblzma-dev -y && \ + apt install --no-install-recommends libicu66 libxml2 pciutils libgl1-mesa-glx libbz2-dev -y && \ + apt install --no-install-recommends libreadline-dev libncurses5 libncurses5-dev libncursesw5 -y && \ + apt install --no-install-recommends git gdb gcc-7 g++-7 -y && \ + sed -i 's@http://mirrors.tuna.tsinghua.edu.cn@https://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 7 \ + --slave 
/usr/bin/g++ g++ /usr/bin/g++-7 --slave /usr/bin/gcov gcov /usr/bin/gcov-7 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9 \ + --slave /usr/bin/g++ g++ /usr/bin/g++-9 --slave /usr/bin/gcov gcov /usr/bin/gcov-9 && \ + update-alternatives --set gcc $(update-alternatives --list gcc | grep gcc-7) + +ENV LD_LIBRARY_PATH=/usr/local/python${PYVERSION}/lib: \ + PATH=/usr/local/python${PYVERSION}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +RUN umask 0022 && \ + wget https://repo.huaweicloud.com/python/${PYVERSION}/Python-${PYVERSION}.tar.xz && \ + tar -xf Python-${PYVERSION}.tar.xz && cd Python-${PYVERSION} && ./configure --prefix=/usr/local/python${PYVERSION} --enable-shared && \ + make -j 16 && make install && \ + ln -sf /usr/local/python${PYVERSION}/bin/python3 /usr/bin/python3 && \ + ln -sf /usr/local/python${PYVERSION}/bin/python3 /usr/bin/python && \ + ln -sf /usr/local/python${PYVERSION}/bin/pip3 /usr/bin/pip3 && \ + ln -sf /usr/local/python${PYVERSION}/bin/pip3 /usr/bin/pip && \ + cd .. && \ + rm -rf Python* && \ + mkdir -p ~/.pip && \ + echo '[global] \n\ + index-url=http://mirrors.aliyun.com/pypi/simple\n\ + trusted-host=mirrors.aliyun.com' >> ~/.pip/pip.conf && \ + pip3 install pip -U + +RUN pip3 install -U pip && \ + pip3 install wheel==0.43.0 scikit-build==0.18.0 numpy==1.24 setuptools==69.5.1 && \ + pip3 install decorator sympy cffi && \ + pip3 install cmake ninja pyyaml && \ + pip3 install pathlib2 protobuf attrs attr scipy && \ + pip3 install requests psutil absl-py && \ + pip3 install torch==2.1.1 torchvision==0.16.1 --index-url=https://download.pytorch.org/whl/cpu && \ + pip3 install transformers==4.41.0 && \ + rm -rf /root/.cache/pip + +ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$LD_LIBRARY_PATH +ENV LD_PRELOAD=/lib/aarch64-linux-gnu/libGLdispatch.so.0:$LD_PRELOAD + +RUN if [ ! -d "/lib64" ]; \ + then \ + mkdir /lib64 && ln -sf /lib/ld-linux-aarch64.so.1 /lib64/ld-linux-aarch64.so.1; \ + fi + +FROM ubuntu:20.04 as buildtemp +COPY ./*.run /tmp + +FROM export_image + +ENV LD_LIBRARY_PATH=\ +$ASCEND_BASE/driver/lib64:\ +$ASCEND_BASE/driver/lib64/common:\ +$ASCEND_BASE/driver/lib64/driver:\ +$ASCEND_BASE/driver/tools/hccn_tool/:\ +$TOOLKIT_PATH/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64/:\ +$LD_LIBRARY_PATH + +RUN --mount=type=cache,target=/tmp,from=buildtemp,source=/tmp \ + umask 0022 && \ + mkdir -p $ASCEND_BASE/driver && \ + if [ "$CHIP" != "all" ]; \ + then \ + CHIPOPTION="--chip=$CHIP"; \ + else \ + CHIPOPTION=""; \ + fi && \ + chmod +x $TOOLKIT_PKG $KERNELS_PKG && \ + ./$TOOLKIT_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all $CHIPOPTION && \ + ./$KERNELS_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all && \ + rm -f $TOOLKIT_PKG $KERNELS_PKG + +ENV GLOG_v=2 \ + LD_LIBRARY_PATH=$TOOLKIT_PATH/lib64:$LD_LIBRARY_PATH \ + TBE_IMPL_PATH=$TOOLKIT_PATH/opp/op_impl/built-in/ai_core/tbe \ + PATH=$TOOLKIT_PATH/ccec_compiler/bin:$PATH \ + ASCEND_OPP_PATH=$TOOLKIT_PATH/opp \ + ASCEND_AICPU_PATH=$TOOLKIT_PATH + +ENV PYTHONPATH=$TBE_IMPL_PATH:$PYTHONPATH + +RUN rm -rf ./* + +SHELL ["/bin/bash", "-c"] +RUN echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc && \ + . 
~/.bashrc + +WORKDIR /deeplink +RUN echo -e "diff --git a/impl/ascend_npu/CMakeLists.txt b/impl/ascend_npu/CMakeLists.txt\n\ +index e684c59..f1cd8d4 100755\n\ +--- a/impl/ascend_npu/CMakeLists.txt\n\ ++++ b/impl/ascend_npu/CMakeLists.txt\n\ +@@ -14,6 +14,11 @@ FetchContent_Declare(op_plugin\n\ + FetchContent_MakeAvailable(op_plugin)\n\ + message(STATUS \"op-plugin download done\")\n\ + \n\ ++add_custom_target(patch_op_plugin_code\n\ ++ COMMAND sed -i 's/GetOpApiLibHandler\(GetCustOpApiLibName\(\)\)/nullptr/' \${op_plugin_SOURCE_DIR}/op_plugin/utils/op_api_common.h\n\ ++ BYPRODUCTS \${op_plugin_SOURCE_DIR}/op_plugin/utils/op_api_common.h\n\ ++)\n\ ++\n\ + add_custom_target(op_plugin_gen\n\ + COMMAND cd \${op_plugin_SOURCE_DIR} && bash ./gencode.sh 2.1 python\n\ + BYPRODUCTS \${op_plugin_SOURCE_DIR}/op_plugin/OpInterface.h \${op_plugin_SOURCE_DIR}/op_plugin/OpInterface.cpp\n\ +@@ -253,7 +258,7 @@ endif()\n\ + set(THIRD_PARTY_INCLUDE_DIRS \${CMAKE_CURRENT_SOURCE_DIR}/../third_party/half/include)\n\ + \n\ + add_library(\${DEVICEIMPL} SHARED \${IMPL_SRC})\n\ +-add_dependencies(\${DEVICEIMPL} op_plugin_gen)\n\ ++add_dependencies(\${DEVICEIMPL} op_plugin_gen patch_op_plugin_code)\n\ + set_target_properties(\${DEVICEIMPL} PROPERTIES SUFFIX \".so\")\n\ + target_include_directories(\${DEVICEIMPL} PRIVATE \${ASCEND_DIR}/ascend-toolkit/latest/include/aclnn)\n\ + target_include_directories(\${DEVICEIMPL} SYSTEM PUBLIC \${THIRD_PARTY_INCLUDE_DIRS})\n" > /deeplink/warning.patch + +# deeplink +RUN git clone https://github.com/DeepLink-org/deeplink.framework.git && \ + cd deeplink.framework/dipu && \ + git checkout ${DEEPLINK_TAG_OR_COMMIT} && \ + git submodule update --init --recursive && \ + git -C ./third_party/DIOPI apply /deeplink/warning.patch && \ + DIPU_DEVICE=ascend python setup.py develop && \ + rm -rf /root/.cache/pip + +# deeplink_ext +RUN git clone https://github.com/DeepLink-org/DeepLinkExt.git && \ + cd DeepLinkExt && \ + git checkout ${DEEPLINKEXT_TAG_OR_COMMIT} && \ + DIPU_REPO=/deeplink/deeplink.framework/dipu DIPU_ROOT=${DIPU_REPO}/torch_dipu \ + DIOPI_PATH=${DIPU_REPO}/third_party/DIOPI/proto \ + VENDOR_INCLUDE_DIRS=/usr/local/Ascend/ascend-toolkit/latest/include \ + pip install -vv --no-build-isolation -e . && \ + rm -rf /root/.cache/pip + +# lmdeploy +WORKDIR /workspace +RUN git clone https://github.com/InternLM/lmdeploy.git && \ + cd lmdeploy && \ + git checkout ${LMDEPLOY_TAG_OR_COMMIT} && \ + sed -i '/triton/d' requirements/runtime.txt && \ + pip install -vv --no-build-isolation -e . && \ + rm -rf /root/.cache/pip diff --git a/docs/en/advance/debug_turbomind.md b/docs/en/advance/debug_turbomind.md index c4c7b32f7f..91733ce2a5 100644 --- a/docs/en/advance/debug_turbomind.md +++ b/docs/en/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind is implemented in C++, which is not as easy to debug as Python. This d ## Prerequisite -First, complete the local compilation according to the commands in [Install from source](../installation.md). +First, complete the local compilation according to the commands in [Install from source](../get_started/installation.md). ## Configure Python debug environment diff --git a/docs/en/advance/structed_output.md b/docs/en/advance/structed_output.md new file mode 100644 index 0000000000..b4b3cd7dfd --- /dev/null +++ b/docs/en/advance/structed_output.md @@ -0,0 +1,106 @@ +# Structured output + +Currently, only the Pytorch backend has this capability. 
Therefore, whether you are using the pipeline or the api_server, please specify the use of the Pytorch backend. + +## pipeline + +```python +from lmdeploy import pipeline +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig + +model = 'internlm/internlm2-chat-1_8b' +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': ['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +pipe = pipeline(model, backend_config=PytorchEngineConfig(), log_level='INFO') +gen_config = GenerationConfig( + response_format=dict(type='json_schema', json_schema=dict(name='test', schema=guide))) +response = pipe(['Make a self introduction please.'], gen_config=gen_config) +print(response) +``` + +## api_server + +Firstly, start the api_server service for the InternLM2 model. + +```shell +lmdeploy serve api_server internlm/internlm2-chat-1_8b --backend pytorch +``` + +The client can test using OpenAI’s python package: The output result is a response in JSON format. + +```python +from openai import OpenAI +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': ['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +response_format=dict(type='json_schema', json_schema=dict(name='test',schema=guide)) +messages = [{'role': 'user', 'content': 'Make a self-introduction please.'}] +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + response_format=response_format, + top_p=0.8) +print(response) +``` diff --git a/docs/en/benchmark/evaluate_with_opencompass.md b/docs/en/benchmark/evaluate_with_opencompass.md index 574b9ed506..04468bea2f 100644 --- a/docs/en/benchmark/evaluate_with_opencompass.md +++ b/docs/en/benchmark/evaluate_with_opencompass.md @@ -8,7 +8,7 @@ In this part, we are going to setup the environment for evaluation. ### Install lmdeploy -Please follow the [installation guide](../installation.md) to install lmdeploy. +Please follow the [installation guide](../get_started/installation.md) to install lmdeploy. ### Install OpenCompass diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md new file mode 100644 index 0000000000..eeb1371ea0 --- /dev/null +++ b/docs/en/get_started/ascend/get_started.md @@ -0,0 +1,117 @@ +# Get Started with Huawei Ascend (Atlas 800T A2) + +The usage of lmdeploy on a Huawei Ascend device is almost the same as its usage on CUDA with PytorchEngine in lmdeploy. +Please read the original [Get Started](../get_started.md) guide before reading this tutorial. 
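In practice, the only change relative to running the PyTorch engine on CUDA is the `device_type` field of `PytorchEngineConfig`. The snippet below is a minimal sketch of that difference, assuming the `internlm/internlm2_5-7b-chat` model used elsewhere in this guide and an environment prepared as described in the Installation section.

```python
from lmdeploy import pipeline, PytorchEngineConfig

# On CUDA the PyTorch engine would be selected with:
#   backend_config = PytorchEngineConfig(tp=1)
# On Huawei Ascend, only device_type changes:
pipe = pipeline('internlm/internlm2_5-7b-chat',
                backend_config=PytorchEngineConfig(tp=1, device_type='ascend'))
response = pipe(['Please introduce China'])
print(response)
```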
+ +## Installation + +### Environment Preparation + +#### Drivers and Firmware + +The host machine needs to install the Huawei driver and firmware version 23.0.3, refer to +[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC3.alpha001&driver=1.0.0.2.alpha). + +#### CANN + +File `docker/Dockerfile_aarch64_ascend` does not provide Ascend CANN installation package, users need to download the CANN (version 8.0.RC3.alpha001) software packages from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.alpha001) themselves. And place the Ascend-cann-kernels-910b\*.run and Ascend-cann-toolkit\*-aarch64.run under the directory where the docker build command is executed. + +#### Docker + +Building the aarch64_ascend image requires Docker >= 18.03 + +#### Reference Command for Building the Image + +The following reference command for building the image is based on the lmdeploy source code root directory as the current directory, and the CANN-related installation packages are also placed under this directory. + +```bash +DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:v0.1 \ +    -f docker/Dockerfile_aarch64_ascend . +``` + +This image will install lmdeploy to `/workspace/lmdeploy` directory using `pip install --no-build-isolation -e .` command. + +#### Using the Image + +You can refer to the [documentation](https://www.hiascend.com/document/detail/zh/mindx-dl/60rc1/clusterscheduling/dockerruntimeug/dlruntime_ug_013.html) +for usage. It is recommended to install Ascend Docker Runtime. +Here is an example of starting container for Huawei Ascend device with Ascend Docker Runtime installed: + +```bash +docker run -e ASCEND_VISIBLE_DEVICES=0 --net host -td --entry-point bash --name lmdeploy_ascend_demo \ +    lmdeploy-aarch64-ascend:v0.1  # docker_image_sha_or_name +``` + +#### Pip install + +If you have lmdeploy installed and all Huawei environments are ready, you can run the following command to enable lmdeploy to run on Huawei Ascend devices. (Not necessary if you use the Docker image.) + +```bash +pip install dlinfer-ascend +``` + +## Offline batch inference + +### LLM inference + +Set `device_type="ascend"`  in the `PytorchEngineConfig`: + +```python +from lmdeploy import pipeline +from lmdeploy import PytorchEngineConfig +if __name__ == "__main__": +    pipe = pipeline("internlm/internlm2_5-7b-chat", +     backend_config = PytorchEngineConfig(tp=1, device_type="ascend")) +    question = ["Shanghai is", "Please introduce China", "How are you?"] +    response = pipe(question) +    print(response) +``` + +### VLM inference + +Set `device_type="ascend"` in the `PytorchEngineConfig`: + +```python +from lmdeploy import pipeline, PytorchEngineConfig +from lmdeploy.vl import load_image +if __name__ == "__main__": + pipe = pipeline('OpenGVLab/InternVL2-2B', +     backend_config=PytorchEngineConfig(tp=1, device_type='ascend')) +    image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +    response = pipe(('describe this image', image)) +    print(response) +``` + +## Online serving + +### Serve a LLM model + +Add `--device ascend` in the serve command. 
+ +```bash +lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5-7b-chat +``` + +### Serve a VLM model + +Add `--device ascend` in the serve command + +```bash +lmdeploy serve api_server --backend pytorch --device ascend OpenGVLab/InternVL2-2B +``` + +## Inference with Command line Interface + +Add `--device ascend` in the serve command. + +```bash +lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend +``` + +Run the following commands to launch lmdeploy chatting after starting container: + +```bash +docker exec -it lmdeploy_ascend_demo \ +    bash -i -c "lmdeploy chat --backend pytorch --device ascend internlm/internlm2_5-7b-chat" +``` diff --git a/docs/en/get_started.md b/docs/en/get_started/get_started.md similarity index 94% rename from docs/en/get_started.md rename to docs/en/get_started/get_started.md index 311980536f..8650858d12 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started/get_started.md @@ -1,6 +1,6 @@ # Quick Start -This tutorial shows the usage of LMDeploy on: +This tutorial shows the usage of LMDeploy on CUDA platform: - Offline inference of LLM model and VLM model - Serve a LLM or VLM model by the OpenAI compatible server @@ -19,7 +19,7 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is']) print(response) ``` -When constructing the `pipeline`, if an inference engine is not designated between the TurboMind Engine and the PyTorch Engine, LMDeploy will automatically assign one based on [their respective capabilities](supported_models/supported_models.md), with the TurboMind Engine taking precedence by default. +When constructing the `pipeline`, if an inference engine is not designated between the TurboMind Engine and the PyTorch Engine, LMDeploy will automatically assign one based on [their respective capabilities](../supported_models/supported_models.md), with the TurboMind Engine taking precedence by default. However, you have the option to manually select an engine. For instance, @@ -74,7 +74,7 @@ response = pipe(prompts, In the `GenerationConfig`, `top_k=1` or `temperature=0.0` indicates greedy search. -For more information about pipeline, please read the [detailed tutorial](llm/pipeline.md) +For more information about pipeline, please read the [detailed tutorial](../llm/pipeline.md) ### VLM inference @@ -110,7 +110,7 @@ print(response) However, the larger the image batch size, the greater risk of an OOM error, because the LLM component within the VLM model pre-allocates a massive amount of memory in advance. -We encourage you to manually choose between the TurboMind Engine and the PyTorch Engine based on their respective capabilities, as detailed in [the supported-models matrix](./supported_models/supported_models.md). +We encourage you to manually choose between the TurboMind Engine and the PyTorch Engine based on their respective capabilities, as detailed in [the supported-models matrix](../supported_models/supported_models.md). 
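As a rough sketch of what that manual choice can look like for a VLM pipeline, the snippet below pins the TurboMind engine and lowers its k/v-cache budget; the model name and the `cache_max_entry_count` value here are illustrative assumptions, not recommended settings.

```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# Explicitly pick the TurboMind engine and shrink the k/v cache ratio
# (default 0.8) to reduce the risk of OOM with large image inputs.
pipe = pipeline('OpenGVLab/InternVL2-2B',
                backend_config=TurbomindEngineConfig(session_len=8192,
                                                     cache_max_entry_count=0.4))
# The PyTorch engine is chosen the same way, passing PytorchEngineConfig instead.

image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
response = pipe(('describe this image', image))
print(response)
```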
Additionally, follow the instructions in [LLM Inference](#llm-inference) section to reduce the values of memory-related parameters ## Serving @@ -147,7 +147,7 @@ response = client.chat.completions.create( print(response) ``` -We encourage you to refer to the detailed guide for more comprehensive information about [serving with Docker](./llm/api_server.md), [function calls](llm/api_server_tools.md) and other topics +We encourage you to refer to the detailed guide for more comprehensive information about [serving with Docker](../llm/api_server.md), [function calls](../llm/api_server_tools.md) and other topics ### Serve a VLM model diff --git a/docs/en/get_started/index.rst b/docs/en/get_started/index.rst new file mode 100644 index 0000000000..4343ee9ab1 --- /dev/null +++ b/docs/en/get_started/index.rst @@ -0,0 +1,8 @@ +On Other Platforms +================================= + +.. toctree:: + :maxdepth: 1 + :caption: NPU(Huawei) + + ascend/get_started.md diff --git a/docs/en/installation.md b/docs/en/get_started/installation.md similarity index 100% rename from docs/en/installation.md rename to docs/en/get_started/installation.md diff --git a/docs/en/index.rst b/docs/en/index.rst index 5f5f3420dc..5d49e01c86 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -38,11 +38,12 @@ Documentation .. _get_started: .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Get Started - installation.md - get_started.md + get_started/installation.md + get_started/get_started.md + get_started/index.rst .. _supported_models: .. toctree:: @@ -101,6 +102,7 @@ Documentation advance/long_context.md advance/chat_template.md advance/debug_turbomind.md + advance/structed_output.md .. toctree:: :maxdepth: 1 diff --git a/docs/en/llm/qos.md b/docs/en/llm/qos.md deleted file mode 100644 index 221679e775..0000000000 --- a/docs/en/llm/qos.md +++ /dev/null @@ -1,219 +0,0 @@ -## LMDeploy-QoS Introduce and Usage - -### Background - -With the rise of Large Language Model (LLM) and Artificial General Intelligence (AGI), numerous inference frameworks have emerged. These frameworks deliver scalable and high-performance services by serving online workloads with language models. However, these workloads often come from multiple user groups, exhibiting rapid changes in workload patterns within short periods. Many inference frameworks struggle to meet the demands of such multi-tenancy traffic patterns and fail to effectively shape user behaviors. Therefore, we believe that systematically considering these issues in LLM inference framework is both valuable and necessary. - -### User Categorizations for Multi-tenancy Handling - -LMDeploy-QoS is part of LMDeploy, offering a range of multi-tenancy functionalities. It requires users to tag their inference requests with appropriate user identifications (user_id in configuration or codebase). The system operates based on a dictionary-like configuration that serves as a multi-tenancy policy. In this configuration, users are mapped to different classes, known as "user groups", each configured with a ratio value. Our multi-tenancy strategy reads this configuration and schedules user inference requests according to class priority and the difference between the predefined ratio and real-time allocation ratio. Extensive testing shows that LMDeploy-QoS significantly enhances LLM serving reliability and GPU resource utilization for real-world large language model inference workloads. 
- -We categorize LMDeploy users into four groups: - -- Platinum -- Gold -- Silver -- Bronze - -Based on our experiences in delivering LLM services, we can map the following four types of users to these user groups: - -- Platinum: VIP or administrative users. Examples include service inspectors or product demo presenters who require uninterrupted online services. Their workloads are typically at a low frequency and require limited resources. - -- Gold: Contracted business user groups requiring specific quantities of reliable services. For instance, Company A signs a contract with the LLM service provider to secure X requests/sec service capability with Z% availability for its employees at the cost of Y million dollars per year. - -- Silver: The vast majority of users fall under this category. Most trial or monthly subscribed users are included in this group. They need a relatively small quantity of services, but their user experiences significantly affect the LLM service reputation. - -- Bronze: Heavy users who pay minimal fees to LLM providers. - -The above user group categorization is intended for guidance rather than as a recommendation for all LMDeploy users, as it may not be suitable for all LLM service providers. Users can develop their own method of categorizing users based on their observations of daily workloads. - -Next, we will discuss how LMDeploy schedules requests based on these categorizations. - -### Multi-tenancy Strategies - -#### Strategy 1: prioritized scheduling between groups - -This strategy works as simple as its title suggests. - -User groups are introduced for this strategy, with users in each group to be specified. Recommended user groups are as follows: - -- Platinum -- Gold -- Silver -- Bronze - -The priority of each group decreases sequentially. Requests with higher priority are always given precedence for inference. Be noted that the scheduling is performed at the time of request reception, so lower-priority requests will not be withdrawn from the GPU if they are already under inference. - -The below diagram shows how the prioritization works. As you can see, the platinum request is reprioritized and moved to the queue head. - -![](https://github.com/InternLM/lmdeploy/assets/52888924/9d63f081-7168-4c74-8456-24f0a4b41649) - -#### Strategy 2: proportionally rated scheduling with a pre-defined ratio within user group - -This strategy works only within the user group. We introduce a within-group user quota configuration table. This table defines users' "ideal share ratio" with a sum value of 100% GPU resource. Each "user" appears in the list as a user_id, and a user can only belong to one user group. Requests from different users will be scheduled according to each user's "ideal share ratio". To be specific, users with their real-time usage ratio lower than their quota ratio will have priority over users whose real-time usage ratio is higher than their quota ratio. It is worth noting that the scheduling only considers users in the request queue, ignoring any absent users from the configuration table. - -The below diagram shows a typical example of how this strategy works. - -![](https://github.com/InternLM/lmdeploy/assets/52888924/3e1d7135-6b11-4998-89a1-b72af6c962c3) - -#### Strategy 3: a combination strategy of 1 and 2 - -We can call it a hybrid strategy. The way we hybrid these 2 strategies is fairly simple: we adopt strategy 1 in between user groups, and adopt strategy 2 within a user group. 
So users belonging to different groups with different priorities will only obey strategy 1 to determine their privilege in resource allocation. That is, when both strategies are applied, the first strategy will overpower the second. When it comes to a situation that no cross-group requests are waiting for serving, the within-group strategy 2 comes into play. - -Below is a diagram showing it. - -![](https://github.com/InternLM/lmdeploy/assets/52888924/e335f976-ff15-48db-b1ff-abf1c3327d6e) - -To be noted, there could be other ways of hybrid strategies 1 & 2, and this doc only introduces one method that works well in our scenario. Considering that prioritization and pro-rated sharing are obviously conflicting strategies, there is no easy way to mix them to work within a single dimension. - -### A Sample QoS Configuration - -The configuration will be specified by the `--qos-config-path` flag, and will be loaded by program upon startup. - -```json -{ - "enable_user_qos": true, - "user_groups": [ - "Platinum", - "Gold", - "Silver", - "Bronze" - ], - "user_group_map": { - "Platinum": [ - { - "id": "user_id0", - "quota_pct": 100 - }, - { - "id": "default", - "quota_pct": 0 - } - ], - "Gold": [ - { - "id": "user_id1", - "quota_pct": 50 - }, - { - "id": "user_id2", - "quota_pct": 50 - } - ], - "Silver": [ - { - "id": "user_id3", - "quota_pct": 5 - }, - { - "id": "default", - "quota_pct": 95 - } - ], - "Bronze": [ - { - "id": "user_id4", - "quota_pct": 30 - }, - { - "id": "user_id5", - "quota_pct": 30 - }, - { - "id": "user_id6", - "quota_pct": 40 - }, - { - "id": "default", - "quota_pct": 0 - } - ] - } -} -``` - -### How to perform inference job with Lmdeploy-QoS aware - -We provide the code link below to show how to call infer requests with multi-tenancy strategy awarded. What the qos related argument appears as in http body: - -/v1/chat/interactive_qos - -```bash -curl -X POST http://localhost/v1/chat/interactive_qos \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello,Hello", - "session_id": -1, - "interactive_mode": false, - "stream": false, - "stop": false, - "request_output_len": 512, - "top_p": 0.8, - "top_k": 40, - "temperature": 0.8, - "repetition_penalty": 1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/chat/completions_qos - -```bash -curl -X POST http://localhost/v1/chat/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "messages": "Hello,Hello", - "temperature": 0.7, - "top_p": 1, - "n": 1, - "max_tokens": 512, - "stop": false, - "stream": false, - "presence_penalty": 0, - "frequency_penalty": 0, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/completions_qos - -```bash -curl -X POST http://localhost/v1/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "prompt": "Hello,Hello", - "suffix": "string", - "temperature": 0.7, - "n": 1, - "max_tokens": 16, - "stop": "string", - "stream": false, - "top_p": 1, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -### File Configuration Modification - -The template of the configuration file is located at: `lmdeploy/server/qos_engine/qos_config.json.template`. Add the necessary users based on actual requirements, ensure correct priority assignment, and set appropriate quota values. 
- -### Passing Configuration Parameters - -Upon starting the api_server, pass the configuration file and its path using the `--qos-config-path` flag. An example is illustrated below: - -```bash -CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm-chat-7b --server-port 8000 --qos-config-path lmdeploy/serve/qos_engine/qos_config.json.template -``` - -### Contributor - -[Eric](https://github.com/rhinouser0), [sallyjunjun](https://github.com/sallyjunjun), [sfireworks](https://github.com/sfireworks), [Dofgal](https://github.com/Dofgal), [shadow](https://github.com/awslshadowstar) diff --git a/docs/en/multi_modal/cogvlm.md b/docs/en/multi_modal/cogvlm.md index d2114e574c..6673e3105b 100644 --- a/docs/en/multi_modal/cogvlm.md +++ b/docs/en/multi_modal/cogvlm.md @@ -17,7 +17,7 @@ pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https: pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https://download.pytorch.org/whl/cu121 ``` -Install LMDeploy by following the [installation guide](../installation.md) +Install LMDeploy by following the [installation guide](../get_started/installation.md) ### Prepare diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index 8f0f81387d..efa2b30a26 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -13,7 +13,7 @@ The next chapter demonstrates how to deploy an InternVL model using LMDeploy, wi ## Installation -Please install LMDeploy by following the [installation guide](../installation.md), and install other packages that InternVL2 needs +Please install LMDeploy by following the [installation guide](../get_started/installation.md), and install other packages that InternVL2 needs ```shell pip install timm diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index 9283fc1435..15774de7e7 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -11,7 +11,7 @@ The next chapter demonstrates how to deploy an MiniCPM-V model using LMDeploy, w ## Installation -Please install LMDeploy by following the [installation guide](../installation.md). +Please install LMDeploy by following the [installation guide](../get_started/installation.md). ## Offline inference diff --git a/docs/en/multi_modal/phi3.md b/docs/en/multi_modal/phi3.md index a801618b35..a7ad0237e2 100644 --- a/docs/en/multi_modal/phi3.md +++ b/docs/en/multi_modal/phi3.md @@ -13,7 +13,7 @@ The next chapter demonstrates how to deploy an Phi-3 model using LMDeploy, with ## Installation -Please install LMDeploy by following the [installation guide](../installation.md) and install the dependency [Flash-Attention](https://github.com/Dao-AILab/flash-attention) +Please install LMDeploy by following the [installation guide](../get_started/installation.md) and install the dependency [Flash-Attention](https://github.com/Dao-AILab/flash-attention) ```shell # It is recommended to find the whl package that matches the environment from the releases on https://github.com/Dao-AILab/flash-attention. 
diff --git a/docs/en/multi_modal/xcomposer2d5.md b/docs/en/multi_modal/xcomposer2d5.md index d6883c0023..2f56b65ea1 100644 --- a/docs/en/multi_modal/xcomposer2d5.md +++ b/docs/en/multi_modal/xcomposer2d5.md @@ -8,7 +8,7 @@ ### Installation -Please install LMDeploy by following the [installation guide](../installation.md), and install other packages that InternLM-XComposer-2.5 needs +Please install LMDeploy by following the [installation guide](../get_started/installation.md), and install other packages that InternLM-XComposer-2.5 needs ```shell pip install decord diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index a9e4e5e12e..3adaf7a750 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -9,7 +9,7 @@ The following NVIDIA GPUs are available for AWQ/GPTQ INT4 inference: - Ampere(sm80,sm86): 30 series, A10, A16, A30, A100 - Ada Lovelace(sm89): 40 series -Before proceeding with the quantization and inference, please ensure that lmdeploy is installed by following the [installation guide](../installation.md) +Before proceeding with the quantization and inference, please ensure that lmdeploy is installed by following the [installation guide](../get_started/installation.md) The remainder of this article is structured into the following sections: diff --git a/docs/zh_cn/advance/debug_turbomind.md b/docs/zh_cn/advance/debug_turbomind.md index cb95c6ef4d..3c3b75421d 100644 --- a/docs/zh_cn/advance/debug_turbomind.md +++ b/docs/zh_cn/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind 使用 C++ 实现,不像 Python 一样易于调试。该文档提供 ## 前置工作 -首先,根据构建[命令](../installation.md)完成源码编译和安装。 +首先,根据构建[命令](../get_started/installation.md)完成源码编译和安装。 ## 配置 Python 调试环境 diff --git a/docs/zh_cn/advance/structed_output.md b/docs/zh_cn/advance/structed_output.md new file mode 100644 index 0000000000..9f8e9c6cc4 --- /dev/null +++ b/docs/zh_cn/advance/structed_output.md @@ -0,0 +1,108 @@ +# 结构化输出 + +目前只有 Pytorch 后端具有该能力。所以无论是使用 pipline 还是使用 api_server,请指定使用 pytorch 后端。 + +## pipeline + +```python +from lmdeploy import pipeline +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig + +model = 'internlm/internlm2-chat-1_8b' +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': ['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +pipe = pipeline(model, backend_config=PytorchEngineConfig(), log_level='INFO') +gen_config = GenerationConfig( + response_format=dict(type='json_schema', json_schema=dict(name='test', schema=guide))) +response = pipe(['Make a self introduction please.'], gen_config=gen_config) +print(response) +``` + +## api_server + +首先,先启动 InternLM2 模型的 api_server 服务。 + +```shell +lmdeploy serve api_server internlm/internlm2-chat-1_8b --backend pytorch +``` + +客户端可以使用 OpenAI 的 python 包进行测试: + +```python +from openai import OpenAI +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': 
['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +response_format=dict(type='json_schema', json_schema=dict(name='test',schema=guide)) +messages = [{'role': 'user', 'content': 'Make a self-introduction please.'}] +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + response_format=response_format, + top_p=0.8) +print(response) +``` + +输出结果是一个 json 格式的回答。 diff --git a/docs/zh_cn/benchmark/evaluate_with_opencompass.md b/docs/zh_cn/benchmark/evaluate_with_opencompass.md index 94ba5326bb..f4480718a8 100644 --- a/docs/zh_cn/benchmark/evaluate_with_opencompass.md +++ b/docs/zh_cn/benchmark/evaluate_with_opencompass.md @@ -8,7 +8,7 @@ LMDeploy设计了TurboMind推理引擎用来加速大模型推理,其推理精 ### 安装 lmdeploy -请参考[安装指南](../installation.md)安装 lmdeploy +请参考[安装指南](../get_started/installation.md)安装 lmdeploy ### 安装 OpenCompass diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md new file mode 100644 index 0000000000..01626e49d6 --- /dev/null +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -0,0 +1,119 @@ +# 华为昇腾(Atlas 800T A2) + +我们采用了LMDeploy中的PytorchEngine后端支持了华为昇腾设备, +所以在华为昇腾上使用lmdeploy的方法与在英伟达GPU上使用PytorchEngine后端的使用方法几乎相同。 +在阅读本教程之前,请先阅读原版的[快速开始](../get_started.md)。 + +## 安装 + +### 环境准备 + +#### Drivers和Firmware + +Host需要安装华为驱动程序和固件版本23.0.3,请参考 +[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC3.alpha001&driver=1.0.0.2.alpha)。 + +#### CANN + +`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.alpha001)下载CANN(8.0.RC3.alpha001)软件包。 +并将Ascend-cann-kernels-910b\*.run 和 Ascend-cann-toolkit\*-aarch64.run 放在执行`docker build`命令的目录下。 + +#### Docker + +构建aarch64_ascend镜像需要Docker>=18.03 + +#### 构建镜像的命令 + +请在lmdeploy源代码根目录下执行以下镜像构建命令,CANN相关的安装包也放在此目录下。 + +```bash +DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:v0.1 \ +    -f docker/Dockerfile_aarch64_ascend . 
+``` + +这个镜像将使用`pip install --no-build-isolation -e .`命令将lmdeploy安装到/workspace/lmdeploy目录。 + +#### 镜像的使用 + +关于镜像的使用方式,请参考这篇[文档](https://www.hiascend.com/document/detail/zh/mindx-dl/60rc1/clusterscheduling/dockerruntimeug/dlruntime_ug_013.html)。 +并且在使用镜像前安装Ascend Docker Runtime。 +以下是在安装了 Ascend Docker Runtime 的情况下,启动用于华为昇腾设备的容器的示例: + +```bash +docker run -e ASCEND_VISIBLE_DEVICES=0 --net host -td --entrypoint bash --name lmdeploy_ascend_demo \ +    lmdeploy-aarch64-ascend:v0.1  # docker_image_sha_or_name +``` + +#### 使用Pip安装 + +如果您已经安装了lmdeploy并且所有华为环境都已准备好,您可以运行以下命令使lmdeploy能够在华为昇腾设备上运行。(如果使用Docker镜像则不需要) + +```bash +pip install dlinfer-ascend +``` + +## 离线批处理 + +### LLM 推理 + +将`device_type="ascend"`加入`PytorchEngineConfig`的参数中。 + +```python +from lmdeploy import pipeline +from lmdeploy import PytorchEngineConfig +if __name__ == "__main__": +    pipe = pipeline("internlm/internlm2_5-7b-chat", +     backend_config = PytorchEngineConfig(tp=1, device_type="ascend")) +    question = ["Shanghai is", "Please introduce China", "How are you?"] +    response = pipe(question) +    print(response) +``` + +### VLM 推理 + +将`device_type="ascend"`加入`PytorchEngineConfig`的参数中。 + +```python +from lmdeploy import pipeline, PytorchEngineConfig +from lmdeploy.vl import load_image +if __name__ == "__main__": +    pipe = pipeline('OpenGVLab/InternVL2-2B', + backend_config=PytorchEngineConfig(tp=1, device_type='ascend')) +    image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +    response = pipe(('describe this image', image)) +    print(response) +``` + +## 在线服务 + +### LLM 模型服务 + +将`--device ascend`加入到服务启动命令中。 + +```bash +lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5-7b-chat +``` + +### VLM 模型服务 + +将`--device ascend`加入到服务启动命令中。 + +```bash +lmdeploy serve api_server --backend pytorch --device ascend OpenGVLab/InternVL2-2B +``` + +## 使用命令行与LLM模型对话 + +将`--device ascend`加入到服务启动命令中。 + +```bash +lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend +``` + +也可以运行以下命令,在启动容器后开启 lmdeploy 聊天 + +```bash +docker exec -it lmdeploy_ascend_demo \ +    bash -i -c "lmdeploy chat --backend pytorch --device ascend internlm/internlm2_5-7b-chat" +``` diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started/get_started.md similarity index 95% rename from docs/zh_cn/get_started.md rename to docs/zh_cn/get_started/get_started.md index 5649397a8f..51d5f0ff81 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started/get_started.md @@ -21,7 +21,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -在构造 `pipeline` 时,如果没有指定使用 TurboMind 引擎或 PyTorch 引擎进行推理,LMDeploy 将根据[它们各自的能力](supported_models/supported_models.md)自动分配一个,默认优先使用 TurboMind 引擎。 +在构造 `pipeline` 时,如果没有指定使用 TurboMind 引擎或 PyTorch 引擎进行推理,LMDeploy 将根据[它们各自的能力](../supported_models/supported_models.md)自动分配一个,默认优先使用 TurboMind 引擎。 然而,你可以选择手动选择一个引擎。例如, @@ -73,7 +73,7 @@ response = pipe(prompts, 在 `GenerationConfig` 中,`top_k=1` 或 `temperature=0.0` 表示贪心搜索。 -有关 pipeline 的更多信息,请参考[这里](llm/pipeline.md) +有关 pipeline 的更多信息,请参考[这里](../llm/pipeline.md) ### VLM 推理 @@ -144,7 +144,7 @@ response = client.chat.completions.create( print(response) ``` -我们鼓励你参考详细指南,了解关于[使用 Docker 部署服务](./llm/api_server.md)、[工具调用](llm/api_server_tools.md)和其他更多功能的信息。 +我们鼓励你参考详细指南,了解关于[使用 Docker 部署服务](../llm/api_server.md)、[工具调用](../llm/api_server_tools.md)和其他更多功能的信息。 ### VLM 模型服务 diff --git a/docs/zh_cn/get_started/index.rst b/docs/zh_cn/get_started/index.rst new file mode
100644 index 0000000000..35affc13ce --- /dev/null +++ b/docs/zh_cn/get_started/index.rst @@ -0,0 +1,8 @@ +其他软硬件平台 +================================= + +.. toctree:: + :maxdepth: 1 + :caption: NPU(Huawei) + + ascend/get_started.md diff --git a/docs/zh_cn/installation.md b/docs/zh_cn/get_started/installation.md similarity index 100% rename from docs/zh_cn/installation.md rename to docs/zh_cn/get_started/installation.md diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 262f970ce0..018a00487f 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -41,8 +41,9 @@ LMDeploy 工具箱提供以下核心功能: :maxdepth: 2 :caption: 快速上手 - installation.md - get_started.md + get_started/installation.md + get_started/get_started.md + get_started/index.rst .. _支持的模型: .. toctree:: @@ -102,6 +103,7 @@ LMDeploy 工具箱提供以下核心功能: advance/long_context.md advance/chat_template.md advance/debug_turbomind.md + advance/structed_output.md .. toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/llm/qos.md b/docs/zh_cn/llm/qos.md deleted file mode 100644 index 10b717fd74..0000000000 --- a/docs/zh_cn/llm/qos.md +++ /dev/null @@ -1,225 +0,0 @@ -## LMDeploy-QoS 介绍与用法 - -### 背景 - -在过去一段时间,推理框架伴随着LLM和AGI出现。许多推理框架为语言模型提供可扩展和高性能的在线工作负载服务。它们的工作负载通常涉及多个用户群体,而且工作负载在短时间内快速变化。许多推理框架在满足这些多租户流量模式的要求方面存在困难,而且未能很好的规范约束用户的行为,所以我们认为在LLM推理框架考虑多用户负载均衡是很有必要的。 - -### 多租户处理的用户分类 - -LMDeploy-QoS与LMDeploy 提供一系列多租户功能。它要求用户使用适当的用户标识(配置文件或代码库中的user_id)标记其推理请求。它是基于字典的配置作为多租户策略。在这个配置中,用户被映射到不同“用户组”中,并配备一个使用配额。我们的多租户策略可以读取配置,并根据其用户组的优先级和预定义配额与实时分配比率之间的差异安排用户推理请求的调度。经过完备的测试,我们的LMDeploy-QoS模块极大地提高了LLM的服务可靠性并提升了大型语言模型推理工作的GPU资源利用率。 - -LMDeploy将用户分为4组: - -- 白金(Platinum) -- 金(Gold) -- 银(Silver) -- 青铜(Bronze) - -根据我们在提供LLM服务方面的使用经验,我们可以将以下4种类型的用户映射到这些用户组中: - -- Platinum : VIP用户或管理员用户。包括需要不间断使用的的服务开发人员或演示人员。他们的工作负载频率低,对推理工作的资源需求也不高。 - -- Gold : 签署定期服务的高级用户,他们需要可衡量的可靠服务。例如,某个公司A与LLM服务提供商签订了合同,购买了每秒X个请求的服务能力,可用性为Z%,供A公司员工使用,年付Y百万美元。 - -- Silver : 绝大多数用户。大多数试用或每月订阅的用户被归类为此类别。他们需要相对较少的服务,但他们的用户体验对于LLM服务的声誉也很重要。 - -- Bronze : 支付很少费用给LLM提供商的重度用户。 - -以上引入用户组分类的目的是为了提供指导,而不是为所有LMDeploy用户提供建议,因为这并不一定适用于所有LLM业务提供商。管理员可以对用户的日常负载进行统计,自行决定如何对用户进行分类。 - -接下来让我们讨论一下LMDeploy如何根据这些分类进行分配请求。 - -### 多租户策略 - -#### 策略 1: 用户组之间的优先级调度 - -我们引入“用户组”概念。由模块使用者来定义哪些用户到用户组的映射(可以理解为 uid 到用户组的映射)。推荐用户组为4组如下: - -- Platinum -- Gold -- Silver -- Bronze - -四个用户组之间的优先级顺序是严格的 Platinum > Gold > Silver > Bronze 。当系统繁忙的时候,我们会优先执行排名靠前的请求。 - -下面的图表显示了优先级处理的工作原理。您可以看到 Platinum 请求已被重新设置优先级并移至队列头部。 - -![](https://github.com/InternLM/lmdeploy/assets/52888924/9d63f081-7168-4c74-8456-24f0a4b41649) - -#### 策略 2: 用户组内均摊与软隔离 - -这个策略仅适用于用户组内部。我们引入了一个用户组内的用户配额配置表。该表定义了用户在 100% GPU 资源中的 “理想份额比例”。每个 “用户” 在列表中以 user_id 的形式出现,并且一个用户只能属于一个用户组。低于配额表上额定值的用户会比高于额定值的用户拥有更高的优先级获得被释放资源而进行更多的推理,直到双方使用量趋近于原始配额比例。此处调度只考虑请求队列中的用户,忽略没有出现在请求队列中的已配置用户。 - -以下图表展示了这种策略的典型示例。 - -![](https://github.com/InternLM/lmdeploy/assets/52888924/3e1d7135-6b11-4998-89a1-b72af6c962c3) - -#### 策略3:混合机制 - -是指在一个系统中优先级+均摊/隔离同时开启。执行顺序是先用户组间优先级,再在组内做均摊/隔离实现。这里略去时序图描写。需要注意的是,用户组间的优先级可以压倒性覆盖组内的决策。例如,当低优先级内部的两个用户互相之间有请求顺序调度时,高优先级的请求一旦抵达,将会覆盖所有低优先级的分配逻辑而有限执行高优任务。 - -![](https://github.com/InternLM/lmdeploy/assets/52888924/e335f976-ff15-48db-b1ff-abf1c3327d6e) - -需要注意的是,混合机制可能有其他方法,本文档只介绍了一种在我们场景下有效的方法。其他混合方法需要考虑到优先级和按比例共享明显是相互冲突的策略,因此没有简单的方法将它们混合在单一维度内工作。 - -### QoS 配置项模板 - -配置文件通过启动参数`--qos-config-path`指定,并由程序在启动时加载。 - -配置会和lmdeploy启动脚本等文件放置在一起。配置内容包含: - -1. QoS的启用开关,设置为True时后续的QoS和用户相关配置才会生效,设置为False后续配置不会生效; - -2. user_groups 是一个列表,包含了多种不同的组间优先级; - -3. 
user_group_map 的映射配置,包含了用户组优先级,组内用户id以及每个用户组内用户的配额分配。 - -配置项模板如下: - -```json -{ - "enable_user_qos": true, - "user_groups": [ - "Platinum", - "Gold", - "Silver", - "Bronze" - ], - "user_group_map": { - "Platinum": [ - { - "id": "user_id0", - "quota_pct": 100 - }, - { - "id": "default", - "quota_pct": 0 - } - ], - "Gold": [ - { - "id": "user_id1", - "quota_pct": 50 - }, - { - "id": "user_id2", - "quota_pct": 50 - } - ], - "Silver": [ - { - "id": "user_id3", - "quota_pct": 5 - }, - { - "id": "default", - "quota_pct": 95 - } - ], - "Bronze": [ - { - "id": "user_id4", - "quota_pct": 30 - }, - { - "id": "user_id5", - "quota_pct": 30 - }, - { - "id": "user_id6", - "quota_pct": 40 - }, - { - "id": "default", - "quota_pct": 0 - } - ] - } -} -``` - -### 如何使用 LMDeploy-QoS 感知进行推理 - -我们提供以下代码链接,展示如何调用具有多租户策略感知的推理请求,在 HTTP Body 中,与 QoS 相关的参数如下: - -/v1/chat/interactive_qos - -```bash -curl -X POST http://localhost/v1/chat/interactive_qos \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello,Hello", - "session_id": -1, - "interactive_mode": false, - "stream": false, - "stop": false, - "request_output_len": 512, - "top_p": 0.8, - "top_k": 40, - "temperature": 0.8, - "repetition_penalty": 1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/chat/completions_qos - -```bash -curl -X POST http://localhost/v1/chat/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "messages": "Hello,Hello", - "temperature": 0.7, - "top_p": 1, - "n": 1, - "max_tokens": 512, - "stop": false, - "stream": false, - "presence_penalty": 0, - "frequency_penalty": 0, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/completions_qos - -```bash -curl -X POST http://localhost/v1/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "prompt": "Hello,Hello", - "suffix": "string", - "temperature": 0.7, - "n": 1, - "max_tokens": 16, - "stop": "string", - "stream": false, - "top_p": 1, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -### 配置文件修改 - -配置文件模板路径为:`lmdeploy/server/qos_engine/qos_config.json.template`,可以根据实际需求添加需要配置的用户,设置正确的优先级以及quota值。 - -### 配置参数传入 - -启动api_server时,通过`--qos-config-path`,将配置文件及路径传入,示例如下: - -```bash -CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm-chat-7b --server-port 8000 --qos-config-path lmdeploy/serve/qos_engine/qos_config.json.template -``` - -### 贡献者 - -[Eric](https://github.com/rhinouser0), [sallyjunjun](https://github.com/sallyjunjun), [sfireworks](https://github.com/sfireworks), [Dofgal](https://github.com/Dofgal), [shadow](https://github.com/awslshadowstar) diff --git a/docs/zh_cn/multi_modal/cogvlm.md b/docs/zh_cn/multi_modal/cogvlm.md index 131ad4f4aa..9810e671d4 100644 --- a/docs/zh_cn/multi_modal/cogvlm.md +++ b/docs/zh_cn/multi_modal/cogvlm.md @@ -17,7 +17,7 @@ pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https: pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https://download.pytorch.org/whl/cu121 ``` -请参考[安装文档](../installation.md)安装 LMDeploy +请参考[安装文档](../get_started/installation.md)安装 LMDeploy ### 准备 diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md index c51870d6b2..1abcbc7d06 100644 --- a/docs/zh_cn/multi_modal/internvl.md +++ b/docs/zh_cn/multi_modal/internvl.md @@ -13,7 +13,7 @@ LMDeploy 支持 InternVL 系列模型,具体如下: ## 安装 -请参考[安装文档](../installation.md)安装 
LMDeploy,并安装上游 InternVL 模型库需的依赖。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy,并安装上游 InternVL 模型库需的依赖。 ```shell pip install timm diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md index 8b41bd511e..b605bc1fcc 100644 --- a/docs/zh_cn/multi_modal/minicpmv.md +++ b/docs/zh_cn/multi_modal/minicpmv.md @@ -11,7 +11,7 @@ LMDeploy 支持 MiniCPM-V 系列模型,具体如下: ## 安装 -请参考[安装文档](../installation.md)安装 LMDeploy。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy。 ## 离线推理 diff --git a/docs/zh_cn/multi_modal/phi3.md b/docs/zh_cn/multi_modal/phi3.md index 2ed120344b..b5545d30b6 100644 --- a/docs/zh_cn/multi_modal/phi3.md +++ b/docs/zh_cn/multi_modal/phi3.md @@ -13,7 +13,7 @@ ## 安装 -请参考[安装文档](../installation.md)安装 LMDeploy,并安装该模型的依赖。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy,并安装该模型的依赖。 ```shell # 建议从https://github.com/Dao-AILab/flash-attention/releases寻找和环境匹配的whl包 diff --git a/docs/zh_cn/multi_modal/xcomposer2d5.md b/docs/zh_cn/multi_modal/xcomposer2d5.md index 31973a4941..033d25c8ac 100644 --- a/docs/zh_cn/multi_modal/xcomposer2d5.md +++ b/docs/zh_cn/multi_modal/xcomposer2d5.md @@ -8,7 +8,7 @@ ### 安装 -请参考[安装文档](../installation.md)安装 LMDeploy,并安装上游模型库 InternLM-XComposer-2.5 所需的依赖。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy,并安装上游模型库 InternLM-XComposer-2.5 所需的依赖。 ```shell pip install decord diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index 83237b992d..b61b894781 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -9,7 +9,7 @@ LMDeploy TurboMind 引擎支持由 [AWQ](https://arxiv.org/abs/2306.00978) 和 [ - Ampere(sm80,sm86): 30 系列,A10, A16, A30, A100 - Ada Lovelace(sm89): 40 系列 -在进行量化和推理之前,请确保按照[安装指南](../installation.md)安装了 lmdeploy。 +在进行量化和推理之前,请确保按照[安装指南](../get_started/installation.md)安装了 lmdeploy。 本文的其余部分由以下章节组成: diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini deleted file mode 100644 index ed07d8038d..0000000000 --- a/examples/cpp/llama/llama_config.ini +++ /dev/null @@ -1,82 +0,0 @@ -[ft_instance_hyperparameter] -data_type=fp16 -enable_custom_all_reduce=0 -pipeline_para_size=1 -tensor_para_size=1 -; update model_dir path according to the actual situation -model_dir=/workspace/models/triton_models/weights/ - - -[request] -request_batch_size=8 -max_input_len=1 -request_output_len=2048 -beam_width=1 ; beam width for beam search -top_k=1 ; k value for top k sampling -top_p=0.0 ; p value for top p sampling -temperature=1.0 ; Use for sampling -repetition_penalty=1.00 ; Use for sampling -presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. 
-len_penalty=0.0 -beam_search_diversity_rate=0.0 -; PJLM start/end ids -start_id=0 -end_id=1 - - -; --------------------- legacy params ------------------------- - -; LLaMA start/end ids -; start_id=1 -; end_id=2 - -[4999_llama] -head_num=80 -size_per_head=128 -vocab_size=65632 -num_layer=82 -rotary_embedding=128 -norm_eps=1e-5 -start_id=0 -end_id=1 -inter_size=27392 - -[llama_7B] -head_num=32 -size_per_head=128 -vocab_size=32000 -num_layer=32 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=11008 - -[llama_13B] -head_num=40 -size_per_head=128 -vocab_size=32000 -num_layer=40 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=13824 - -[llama_30B] -head_num=52 -size_per_head=128 -vocab_size=32000 -num_layer=60 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=17920 - -[llama_65B] -head_num=64 -size_per_head=128 -vocab_size=32000 -num_layer=80 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=22016 diff --git a/examples/cpp/llama/llama_config.yaml b/examples/cpp/llama/llama_config.yaml new file mode 100644 index 0000000000..463614de06 --- /dev/null +++ b/examples/cpp/llama/llama_config.yaml @@ -0,0 +1,24 @@ +ft_instance_hyperparameter: + data_type: fp16 + enable_custom_all_reduce: 0 + pipeline_para_size: 1 + tensor_para_size: 1 + # update model_dir path according to the actual situation + model_dir: /workspace/models/triton_models/weights/ + + +request: + request_batch_size: 8 + max_input_len: 1 + request_output_len: 2048 + beam_width: 1 # beam width for beam search + top_k: 1 # k value for top k sampling + top_p: 0.0 # p value for top p sampling + temperature: 1.0 # Use for sampling + repetition_penalty: 1.00 # Use for sampling + presence_penalty: 0.0 # Only one of repetition_penalty and presence_penalty are allowed. + len_penalty: 0.0 + beam_search_diversity_rate: 0.0 + # PJLM start/end ids + start_id: 0 + end_id: 1 diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 11036cf1b7..b0e513410e 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -18,7 +18,7 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/examples/cpp/multi_gpu_gpt/multi_gpu_gpt_triton_example.cc -#include "3rdparty/INIReader.h" +#include #include #include #include @@ -254,20 +254,24 @@ int read_start_ids(size_t batch_size, std::string file_name); std::vector>> -prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record, const std::string& csv_name) +prepareRequest(std::string config_file, const int node_id, const int gpu_count, std::vector* pointer_record, const std::string& csv_name) { - INIReader reader = INIReader(ini_name); - if (reader.ParseError() < 0) { - std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + YAML::Node reader; + try { + reader = YAML::Load(config_file); + } + catch (const YAML::Exception& e) { + std::cerr << "Error reading YAML config: " << e.what() << std::endl; ft::FT_CHECK(false); } + auto request = reader["request"]; - const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const size_t request_batch_size = request["request_batch_size"].as(); std::cerr << "request_batch_size=" << request_batch_size << "\n"; - const int start_id = reader.GetInteger("request", "start_id"); - const int end_id = reader.GetInteger("request", "end_id"); - const int max_input_len = reader.GetInteger("request", "max_input_len"); + const int start_id = request["start_id"].as(); + 
const int end_id = request["end_id"].as(); + const int max_input_len = request["max_input_len"].as(); std::vector v_start_ids; std::vector v_start_lengths; @@ -289,16 +293,16 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std std::vector v_bad_words; RequestParam param; - param.beam_width = reader.GetInteger("request", "beam_width"); - param.request_output_len = reader.GetInteger("request", "request_output_len"); - param.beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); - param.runtime_top_k = reader.GetInteger("request", "top_k"); - param.runtime_top_p = reader.GetFloat("request", "top_p"); - param.temperature = reader.GetFloat("request", "temperature"); - param.len_penalty = reader.GetFloat("request", "len_penalty"); - param.repetition_penalty = reader.GetFloat("request", "repetition_penalty", 1.0f); - param.presence_penalty = reader.GetFloat("request", "presence_penalty", 0.0f); - param.min_length = reader.GetInteger("request", "min_length", 0); + param.beam_width = request["beam_width"].as(); + param.request_output_len = request["request_output_len"].as(); + param.beam_search_diversity_rate = request["beam_search_diversity_rate"].as(); + param.runtime_top_k = request["top_k"].as(); + param.runtime_top_p = request["top_p"].as(); + param.temperature = request["temperature"].as(); + param.len_penalty = request["len_penalty"].as(); + param.repetition_penalty = request["repetition_penalty"].as(1.0f); + param.presence_penalty = request["presence_penalty"].as(0.0f); + param.min_length = request["min_length"].as(0); param.random_seed = (unsigned long long int)0; param.start_id = start_id; param.end_id = end_id; @@ -361,11 +365,11 @@ int main(int argc, char* argv[]) // Note: Only supports that all nodes have same gpu count const int gpu_count = ft::getDeviceCount(); const int world_size = node_num * gpu_count; - printf("Recommend to specify the first parameter on the command line as the path to llama_config.ini\n"); - std::string ini_name = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.ini"; + printf("Recommend to specify the first parameter on the command line as the path to llama_config.yaml\n"); + std::string config_file = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.yaml"; // step 1: Create model - std::shared_ptr model = AbstractTransformerModel::createLlamaModel(ini_name); + std::shared_ptr model = AbstractTransformerModel::createLlamaModel(config_file); int tensor_para_size = model->getTensorParaSize(); int pipeline_para_size = model->getPipelineParaSize(); printf( @@ -406,7 +410,7 @@ int main(int argc, char* argv[]) std::vector pointer_record; // Used to prevent the pointers are // release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, csv_name); + prepareRequest(config_file, node_id, gpu_count, &pointer_record, csv_name); printf("[INFO] request is created \n"); // step 5: Forward diff --git a/lmdeploy/__init__.py b/lmdeploy/__init__.py index a9a6ee095a..df64717919 100644 --- a/lmdeploy/__init__.py +++ b/lmdeploy/__init__.py @@ -1,16 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .api import client, pipeline, serve -from .messages import (EngineGenerationConfig, GenerationConfig, - PytorchEngineConfig, TurbomindEngineConfig, - VisionConfig) +from .messages import (GenerationConfig, PytorchEngineConfig, + TurbomindEngineConfig, VisionConfig) from .model import ChatTemplateConfig from .tokenizer import Tokenizer from .version import __version__, version_info __all__ = [ 'pipeline', 'serve', 'client', 'Tokenizer', 'GenerationConfig', - 'EngineGenerationConfig', '__version__', 'version_info', - 'ChatTemplateConfig', 'PytorchEngineConfig', 'TurbomindEngineConfig', - 'VisionConfig' + '__version__', 'version_info', 'ChatTemplateConfig', 'PytorchEngineConfig', + 'TurbomindEngineConfig', 'VisionConfig' ] diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py index 64b714765a..50757d50d4 100644 --- a/lmdeploy/archs.py +++ b/lmdeploy/archs.py @@ -157,16 +157,16 @@ def get_model_arch(model_path: str): """ if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')): # the turbomind model - import configparser + import yaml config_file = os.path.join(model_path, 'triton_models', 'weights', - 'config.ini') - config = configparser.ConfigParser() - config.read(config_file) - model_arch = config['llama']['model_arch'] - tm_config = TurbomindEngineConfig() - for key in config['llama']: - setattr(tm_config, key, config['llama'][key]) - return model_arch, tm_config + 'config.yaml') + with open(config_file, 'r') as f: + config = yaml.safe_load(f) + + from .turbomind.deploy.config import TurbomindModelConfig + tm_config = TurbomindModelConfig.from_dict(config) + + return tm_config.model_config.model_arch, tm_config else: # transformers model try: diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index 9aa6000505..1239f9d365 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -48,6 +48,34 @@ def add_parser_auto_awq(): default=128, help='Group size for weight quantization statistics') + @staticmethod + def add_parser_auto_gptq(): + """Add parser for auto_gptq command.""" + parser = SubCliLite.subparsers.add_parser( + 'auto_gptq', + formatter_class=DefaultsAndTypesHelpFormatter, + description=SubCliLite.auto_gptq.__doc__, + help=SubCliLite.auto_gptq.__doc__) + parser.set_defaults(run=SubCliLite.auto_gptq) + parser.add_argument('model', + type=str, + help='The path of model in hf format') + ArgumentHelper.revision(parser) + ArgumentHelper.work_dir(parser) + ArgumentHelper.calib_dataset(parser) + ArgumentHelper.calib_samples(parser) + ArgumentHelper.calib_seqlen(parser) + ArgumentHelper.calib_batchsize(parser) + parser.add_argument('--w-bits', + type=int, + default=4, + help='Bit number for weight quantization') + parser.add_argument( + '--w-group-size', + type=int, + default=128, + help='Group size for weight quantization statistics') + @staticmethod def add_parser_calibrate(): """Add parser for calibrate command.""" @@ -97,6 +125,13 @@ def auto_awq(args): kwargs = convert_args(args) auto_awq(**kwargs) + @staticmethod + def auto_gptq(args): + """Perform weight quantization using GPTQ algorithm.""" + from lmdeploy.lite.apis.gptq import auto_gptq + kwargs = convert_args(args) + auto_gptq(**kwargs) + @staticmethod def calibrate(args): """Perform calibration on a given dataset.""" @@ -115,5 +150,6 @@ def smooth_quant(args): def add_parsers(): """Add all parsers.""" SubCliLite.add_parser_auto_awq() + SubCliLite.add_parser_auto_gptq() SubCliLite.add_parser_calibrate() SubCliLite.add_parser_smooth_quant() diff --git a/lmdeploy/lite/apis/gptq.py 
b/lmdeploy/lite/apis/gptq.py new file mode 100644 index 0000000000..12b88a52cd --- /dev/null +++ b/lmdeploy/lite/apis/gptq.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch +from transformers import AutoTokenizer + +from lmdeploy.lite.utils.calib_dataloader import get_calib_loaders + + +def auto_gptq(model: str, + work_dir: str = './work_dir', + w_bits: int = 4, + w_group_size: int = 128, + calib_dataset: str = 'ptb', + calib_samples: int = 128, + calib_seqlen: int = 2048, + batch_size: int = 1, + revision: str = None): + """Perform weight quantization using GPTQ algorithm. + + Args: + model (str): The path of model in hf format. + work_dir (str): The working directory to save results. + calib_dataset (str): The calibration dataset name. + calib_samples (int): The number of samples for calibration. + batch_size (int): The batch size for running the calib samples. + Low GPU mem requires small batch_size. Large batch_size + reduces the calibration time while costs more VRAM. + calib_seqlen (int): The sequence length for calibration. + w_bits (int): Bit number for weight quantization. + w_group_size (int): Group size for weight quantization statistics. + revision (str): The specific model version to use. It can be a + branch name, a tag name, or a commit id. If unspecified, + will use the default version. + """ + try: + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig + except Exception: + raise ImportError('To use auto_gptq, please install auto-gptq by ' + 'pip install auto-gptq') + logging.basicConfig( + format='%(asctime)s %(levelname)s [%(name)s] %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S', + ) + # support internlm2 + from auto_gptq.modeling import GPTQ_CAUSAL_LM_MODEL_MAP + from auto_gptq.modeling._const import SUPPORTED_MODELS + + from ..modeling.internlm2_gptq import InternLM2GPTQForCausalLM + SUPPORTED_MODELS.append('internlm2') + GPTQ_CAUSAL_LM_MODEL_MAP.update(dict(internlm2=InternLM2GPTQForCausalLM)) + + pretrained_model_dir = model + quantized_model_dir = work_dir + + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, + trust_remote_code=True) + print('Loading calibration dataset ...') + calib_loader, _ = get_calib_loaders(calib_dataset, + tokenizer, + nsamples=calib_samples, + seqlen=calib_seqlen) + all_data = [ + data if isinstance(data, torch.Tensor) else data[0] + for data in calib_loader + ] + attention_mask = [1] * calib_seqlen + examples = [ + dict(input_ids=data.flatten().tolist(), attention_mask=attention_mask) + for data in all_data + ] + + quantize_config = BaseQuantizeConfig( + bits=w_bits, # quantize model to 4-bit + group_size=w_group_size, # it is recommended to set the value to 128 + desc_act=False, # lmdeploy only supports False + sym=True, # lmdeploy only supports True + ) + + # load un-quantized model, by default, + # the model will always be loaded into CPU memory + model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, + quantize_config, + revision=revision, + trust_remote_code=True) + + # quantize model, the examples should be list of dict whose keys + # can only be "input_ids" and "attention_mask" + model.quantize(examples, batch_size=batch_size) + + # save quantized model + model.save_quantized(quantized_model_dir) + + tokenizer.save_pretrained(quantized_model_dir) + + +if __name__ 
== '__main__': + import fire + + fire.Fire(auto_gptq) diff --git a/lmdeploy/lite/modeling/__init__.py b/lmdeploy/lite/modeling/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/lmdeploy/lite/modeling/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/lite/modeling/internlm2_gptq.py b/lmdeploy/lite/modeling/internlm2_gptq.py new file mode 100644 index 0000000000..a8b493c46a --- /dev/null +++ b/lmdeploy/lite/modeling/internlm2_gptq.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from auto_gptq.modeling import BaseGPTQForCausalLM + + +class InternLM2GPTQForCausalLM(BaseGPTQForCausalLM): + layer_type = 'InternLM2DecoderLayer' + layers_block_name = 'model.layers' + outside_layer_modules = ['model.tok_embeddings', 'model.norm'] + inside_layer_modules = [ + ['attention.wqkv'], + ['attention.wo'], + ['feed_forward.w3', 'feed_forward.w1'], + ['feed_forward.w2'], + ] diff --git a/lmdeploy/lite/utils/calib_dataloader.py b/lmdeploy/lite/utils/calib_dataloader.py index 27cde59f56..b5cf7e1f6a 100644 --- a/lmdeploy/lite/utils/calib_dataloader.py +++ b/lmdeploy/lite/utils/calib_dataloader.py @@ -22,8 +22,14 @@ def get_wikitext2(tokenizer, nsamples, seed, seqlen): test_enc: Full tokenized Wikitext-2 test set. """ from datasets import load_dataset - traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') - testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + traindata = load_dataset('wikitext', + 'wikitext-2-raw-v1', + split='train', + trust_remote_code=True) + testdata = load_dataset('wikitext', + 'wikitext-2-raw-v1', + split='test', + trust_remote_code=True) trainenc = tokenizer('\n\n'.join(traindata['text']), return_tensors='pt') testenc = tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') @@ -55,10 +61,14 @@ def get_ptb(tokenizer, nsamples, seed, seqlen): test_enc: Full tokenized PTB validation set. """ from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') + traindata = load_dataset('ptb_text_only', + 'penn_treebank', + split='train', + trust_remote_code=True) valdata = load_dataset('ptb_text_only', 'penn_treebank', - split='validation') + split='validation', + trust_remote_code=True) trainenc = tokenizer('\n\n'.join(traindata['sentence']), return_tensors='pt') @@ -96,13 +106,15 @@ def get_c4(tokenizer, nsamples, seed, seqlen): 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', - use_auth_token=False) + use_auth_token=False, + trust_remote_code=True) valdata = load_dataset( 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation', - use_auth_token=False) + use_auth_token=False, + trust_remote_code=True) import random random.seed(seed) @@ -158,8 +170,14 @@ def get_ptb_new(tokenizer, nsamples, seed, seqlen): test_enc: Full tokenized PTB validation set. 
""" from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') - testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + traindata = load_dataset('ptb_text_only', + 'penn_treebank', + split='train', + trust_remote_code=True) + testdata = load_dataset('ptb_text_only', + 'penn_treebank', + split='test', + trust_remote_code=True) trainenc = tokenizer(' '.join(traindata['sentence']), return_tensors='pt') testenc = tokenizer(' '.join(testdata['sentence']), return_tensors='pt') @@ -195,12 +213,14 @@ def get_c4_new(tokenizer, nsamples, seed, seqlen): 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, - split='train') + split='train', + trust_remote_code=True) valdata = load_dataset( 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, - split='validation') + split='validation', + trust_remote_code=True) import random random.seed(seed) @@ -248,7 +268,8 @@ def get_pileval(tokenizer, nsamples, seed, seqlen=512): from datasets.builder import DatasetGenerationError try: dataset = load_dataset('mit-han-lab/pile-val-backup', - split='validation') + split='validation', + trust_remote_code=True) except DatasetGenerationError: raise InterruptedError('There have been some issues when generating ' 'the dataset, you could try to download it ' diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 865c2249de..4d3c97d718 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -23,6 +23,8 @@ class GenerationConfig: input message. **Only 1** is supported now. max_new_tokens (int): The maximum number of tokens that can be generated in the chat completion + do_sample (bool): Whether or not to use sampling, use greedy + decoding otherwise. Default to be False. top_p (float): An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass @@ -36,51 +38,44 @@ class GenerationConfig: random_seed (int): Seed used when sampling a token stop_words (List[str]): Words that stop generating further tokens bad_words (List[str]): Words that the engine will never generate + stop_token_ids (List[int]): List of tokens that stop the generation + when they are generated. The returned output will not contain + the stop tokens. + bad_token_ids (List[str]): List of tokens that the engine will never + generate. min_new_tokens (int): The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt. skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. logprobs (int): Number of log probabilities to return per output token. - """ + response_format (Dict): Only pytorch backend support formatting + response. Examples: `{"type": "json_schema", "json_schema": {"name":"test","schema": {"properties": {"name": {"type": "string"}}, "required": ["name"], "type": "object"}}}` + or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}` + logits_processors (List[Callable]): Custom logit processors. 
+ """ # noqa n: int = 1 max_new_tokens: int = 512 + do_sample: bool = False top_p: float = 1.0 - top_k: int = 1 + top_k: int = 50 temperature: float = 0.8 repetition_penalty: float = 1.0 ignore_eos: bool = False random_seed: int = None stop_words: List[str] = None bad_words: List[str] = None + stop_token_ids: List[int] = None + bad_token_ids: List[int] = None min_new_tokens: int = None skip_special_tokens: bool = True logprobs: int = None + response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None - -@dataclass -class EngineGenerationConfig(GenerationConfig): - """generation parameter used by the inference engines.""" - stop_words: List[int] = None - bad_words: List[int] = None - - @staticmethod - def From(gen_config: GenerationConfig, tokenizer: Tokenizer): - """convert `GenerationConfig` to `EngineGenerationConfig` - Args: - gen_config (GenerationConfig): an instance of class `GenerationConfig` - tokenizer (Tokenizer): a tokenizer to encode the `stop_words` and `bad_words` in `gen_config` - - Returns: - EngineGenerationConfig: the generation config used by inference engines - - Examples: - >>> from lmdeploy import Tokenizer, GenerationConfig, EngineGenerationConfig - >>> tokenizer = Tokenizer('internlm/internlm-chat-7b') - >>> gen_config = GenerationConfig(stop_words=['']) - >>> gen_config = EngineGenerationConfig.From(gen_config, tokenizer) - """ # noqa E501 + def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer): + """convert stop_words/bad_sords to ids and append the ids to + stop_token_ids/bad_token_ids.""" def special_word_token_ids(words): if words is not None: @@ -93,21 +88,12 @@ def special_word_token_ids(words): return indexes return None - return EngineGenerationConfig( - n=gen_config.n, - logprobs=gen_config.logprobs, - max_new_tokens=gen_config.max_new_tokens, - min_new_tokens=gen_config.min_new_tokens, - top_p=gen_config.top_p, - top_k=gen_config.top_k, - temperature=gen_config.temperature, - repetition_penalty=gen_config.repetition_penalty, - ignore_eos=gen_config.ignore_eos, - random_seed=gen_config.random_seed, - skip_special_tokens=gen_config.skip_special_tokens, - stop_words=special_word_token_ids(gen_config.stop_words), - bad_words=special_word_token_ids(gen_config.bad_words), - logits_processors=gen_config.logits_processors) + stop_token_ids = special_word_token_ids(self.stop_words) or [] + bad_token_ids = special_word_token_ids(self.bad_words) or [] + stop_token_ids.extend(self.stop_token_ids or []) + bad_token_ids.extend(self.bad_token_ids or []) + self.stop_token_ids = list(set(stop_token_ids)) or None + self.bad_token_ids = list(set(bad_token_ids)) or None def __post_init__(self): """Check input validation.""" @@ -123,18 +109,21 @@ class TurbomindEngineConfig: """TurboMind Engine config. Args: - model_format (str): the layout of the deployed model. It can be one of the following values [hf, meta_llama, awq], - `hf` meaning huggingface model(.bin, .safetensors), `meta_llama` being meta llama's format(.pth), awq` meaning the quantized model by AWQ. + model_format (str): the layout of the deployed model. It can be one of the following values [hf, meta_llama, awq, gptq], + `hf` meaning huggingface model(.bin, .safetensors), `meta_llama` being meta llama's format(.pth), + `awq` and `gptq` meaning the quantized model by AWQ and GPTQ, respectively. + If it is not specified, i.e. 
None, it will be extracted from the input model tp (int): the number of GPU cards used in tensor parallelism, default to 1 session_len (int): the max session length of a sequence, default to None max_batch_size (int): the max batch size during inference, default to 128 cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache. For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it defaults to 0.5, depicting the percentage of TOTAL GPU memory to be allocated to the k/v cache. For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8, signifying the percentage of FREE GPU memory to be reserved for the k/v cache + cache_chunk_size (int): The policy to apply for KV block from the block manager, default to -1. cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64 enable_prefix_caching (bool): enable cache prompts for block reuse, default to False quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4 - rope_scaling_factor (int): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention + rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention use_logn_attn (bool): whether or not to use log attn: default to False download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface. revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. @@ -148,6 +137,7 @@ class TurbomindEngineConfig: session_len: Optional[int] = None max_batch_size: int = 128 cache_max_entry_count: float = 0.8 + cache_chunk_size: int = -1 cache_block_seq_len: int = 64 enable_prefix_caching: bool = False quant_policy: int = 0 diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py index a9b1390aa2..2b5ee85edc 100644 --- a/lmdeploy/pytorch/chat.py +++ b/lmdeploy/pytorch/chat.py @@ -5,7 +5,7 @@ from typing import List, Optional from lmdeploy.archs import get_model_arch -from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.async_engine import get_names_from_model from lmdeploy.tokenizer import DetokenizeState, Tokenizer @@ -52,7 +52,7 @@ def _stop_words(stop_words: List[str], tokenizer: Tokenizer): def run_chat(model_path: str, engine_config: PytorchEngineConfig, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, session_id: int = 1, trust_remote_code: bool = True, chat_template_config: Optional[ChatTemplateConfig] = None): @@ -62,7 +62,7 @@ def run_chat(model_path: str, Args: model_path (str): the huggingface model path. engine_config (PytorchEngineConfig): Config of engine. - gen_config (EngineGenerationConfig): Config of generation. + gen_config (GenerationConfig): Config of generation. session_id (int): the identical id of a session. trust_remote_code (bool): trust remote code. 
""" @@ -77,7 +77,7 @@ def run_chat(model_path: str, adapter_name = next(iter(engine_config.adapters.keys())) if gen_config is None: - gen_config = EngineGenerationConfig() + gen_config = GenerationConfig() nth_round = 1 step = 0 @@ -113,7 +113,7 @@ def run_chat(model_path: str, print(f'{prompt}', end='', flush=True) state = DetokenizeState(len(input_ids)) gen_config.random_seed = seed - gen_config.stop_words = stop_words + gen_config.stop_token_ids = stop_words for outputs in generator.stream_infer(session_id=session_id, input_ids=input_ids, gen_config=gen_config, @@ -162,12 +162,12 @@ def main(model_path: str, if adapter is not None: adapters = dict(default=adapter) engine_config = PytorchEngineConfig(tp=tp, adapters=adapters) - gen_config = EngineGenerationConfig(max_new_tokens=512, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - ignore_eos=False) + gen_config = GenerationConfig(max_new_tokens=512, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + ignore_eos=False) chat_template_config = None if chat_template is not None and os.path.exists(chat_template): chat_template_config = ChatTemplateConfig.from_json(chat_template) diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 1f14ac92b9..6470e09ed3 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -23,14 +23,19 @@ def _handle_exception(e: Exception, def check_env_deeplink(device_type: str): - """check Deeplink environment if specific device_type is set.""" + """check Deeplink environment.""" + try_import_deeplink(device_type) + + +def try_import_deeplink(device_type: str): + """import dlinfer if specific device_type is set.""" deeplink_device_type_list = [ 'ascend', ] if device_type in deeplink_device_type_list: logger = get_logger('lmdeploy') try: - import deeplink_ext # noqa: F401 + import dlinfer.framework.lmdeploy_ext # noqa: F401 except Exception as e: _handle_exception(e, 'PyTorch', logger) diff --git a/lmdeploy/pytorch/engine/devices/ascend.py b/lmdeploy/pytorch/engine/devices/ascend.py index a09fa5f655..9c782a3f3d 100644 --- a/lmdeploy/pytorch/engine/devices/ascend.py +++ b/lmdeploy/pytorch/engine/devices/ascend.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import torch -from .dipu import DIPUDeviceUtils +from .base_device_utils import BaseDeviceUtils -class ASCENDDeviceUtils(DIPUDeviceUtils): +class ASCENDDeviceUtils(BaseDeviceUtils): device = 'ascend' @@ -17,7 +17,8 @@ def update_step_context(cls, step_context): single_attention_mask = torch.logical_not( torch.tril( torch.ones(step_context.q_seq_length[i], - step_context.kv_seq_length[i], + step_context.block_offsets.shape[1] * + block_size, dtype=torch.bool).cuda(), diagonal=step_context.kv_seq_length[i] - step_context.q_seq_length[i], @@ -28,7 +29,7 @@ def update_step_context(cls, step_context): block_loc = step_context.block_offsets[i][block_idx] token_loc = history_length % block_size for _ in range(step_context.q_seq_length[i]): - kv_start_indices.append(block_loc * block_size + token_loc) + kv_start_indices.append([block_loc * block_size + token_loc]) if _ == step_context.q_seq_length[i] - 1: break token_loc = (token_loc + 1) % block_size @@ -38,4 +39,11 @@ def update_step_context(cls, step_context): kv_start_indices, device=step_context.block_offsets.device) setattr(step_context, 'kv_start_indices', kv_start_indices) setattr(step_context, 'attention_mask', attention_mask) + setattr(step_context, 'q_start_loc', step_context.q_start_loc.cpu()) + setattr(step_context, 'q_seq_length', step_context.q_seq_length.cpu()) + setattr(step_context, 'kv_seq_length', + step_context.kv_seq_length.cpu()) + is_unpaged_prefill = (not step_context.is_decoding) and all( + (step_context.q_seq_length == step_context.kv_seq_length).tolist()) + setattr(step_context, 'is_unpaged_prefill', is_unpaged_prefill) return step_context diff --git a/lmdeploy/pytorch/engine/devices/dipu.py b/lmdeploy/pytorch/engine/devices/dipu.py deleted file mode 100644 index d2cc9c4243..0000000000 --- a/lmdeploy/pytorch/engine/devices/dipu.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .base_device_utils import BaseDeviceUtils - - -class DIPUDeviceUtils(BaseDeviceUtils): - - device = 'dipu' - - @classmethod - def update_step_context(cls, step_context): - """update step context.""" - raise NotImplementedError('`update_step_context` of ' - f'<{cls}> not implemented.') diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index d37ab1b8c7..4a8d02cd7d 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -7,7 +7,7 @@ import numpy as np import torch -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, ResponseType) from lmdeploy.utils import get_logger, get_model, logging_timer @@ -500,6 +500,7 @@ def _batch_stopping_criteria(self, token_ids: torch.Tensor, @logging_timer('SamplingLogits', logger) def async_sampling_logits(self, logits: torch.Tensor, all_ids: torch.Tensor, + guided_input_ids: torch.Tensor, sampling_inputs: SamplingInputs, inputs: ModelInputs, ignore_eos: torch.Tensor): """sampling logits.""" @@ -514,8 +515,9 @@ def __get_last_logits(): return logits[last_idx, :] split_logits = __get_last_logits().cuda() - logits_processor = FusedLogitsProcessor(sampling_inputs, ignore_eos) - logits = logits_processor(split_logits, all_ids) + logits_processor = FusedLogitsProcessor(sampling_inputs, ignore_eos, + self.tokenizer.model.model) + logits = logits_processor(all_ids, guided_input_ids, split_logits) next_token_ids = logits_processor.sampling(logits) return next_token_ids @@ -679,7 +681,8 @@ def __get_q_start_loc(): async def _async_step_background( self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict, - all_ids: torch.Tensor, sampling_inputs: SamplingInputs, + all_ids: torch.Tensor, guided_input_ids: torch.Tensor, + sampling_inputs: SamplingInputs, num_appendable_ids: torch.LongTensor, num_ignore_eos: torch.LongTensor, return_logits: bool, output_que: asyncio.Queue): @@ -687,11 +690,16 @@ async def _async_step_background( def __update_inputs(next_token_ids): """update inputs.""" - nonlocal all_ids + nonlocal all_ids, guided_input_ids inputs.update(next_token_ids) if all_ids is not None: all_ids = torch.cat( [all_ids, next_token_ids[:, None].to(all_ids.device)], 1) + if guided_input_ids is not None: + guided_input_ids = torch.cat([ + guided_input_ids, next_token_ids[:, None].to( + guided_input_ids.device) + ], 1) if sampling_inputs.random_offsets is not None: sampling_inputs.random_offsets += 1 @@ -701,6 +709,8 @@ def __update_inputs(next_token_ids): is_decoding = inputs.is_decoding if all_ids is not None: all_ids = all_ids.cuda() + if guided_input_ids is not None: + guided_input_ids = guided_input_ids.cuda() sampling_inputs = sampling_inputs.to_device('cuda') num_appendable_ids = num_appendable_ids.cuda() num_ignore_eos = num_ignore_eos.cuda() @@ -720,7 +730,8 @@ def __update_inputs(next_token_ids): # sampling next_token_ids = self.async_sampling_logits( - logits, all_ids, sampling_inputs, inputs, num_ignore_eos > 0) + logits, all_ids, guided_input_ids, sampling_inputs, inputs, + num_ignore_eos > 0) num_ignore_eos = num_ignore_eos - 1 # stopping criteria @@ -766,6 +777,24 @@ def __gather_all_ids(seqs: SeqList, sampling_inputs: SamplingInputs): output[idx, -h_len:] = h_ids return output + def __gather_guided_input_ids(seqs: SeqList, + sampling_inputs: SamplingInputs): + """gather input ids for guided decode.""" + if not any(sampling_inputs.response_formats or ()): + return None + batch = 
len(seqs) + max_len = max(seq.num_new_tokens for seq in seqs) + pad_id = self.model_config.bos_token_id + pad_id = 0 if pad_id is None else pad_id + output = torch.full((batch, max_len), pad_id, dtype=torch.int64) + for idx, seq in enumerate(seqs): + h_len = seq.num_new_tokens + if h_len == 0: + continue + h_ids = torch.from_numpy(seq.all_ids[-seq.num_new_tokens:]) + output[idx, -h_len:] = h_ids + return output + def __get_num_appendable_ids(seqs: SeqList): """get num appendable ids.""" ret = [ @@ -802,6 +831,8 @@ def __need_logits(seqs: SeqList): is_prefill) sampling_inputs = SamplingInputs.from_sampling_params(running) all_ids = __gather_all_ids(running, sampling_inputs) + guided_input_ids = __gather_guided_input_ids( + running, sampling_inputs) num_appendable_ids = __get_num_appendable_ids(running) num_ignore_eos = __get_num_ignore_eos(running) return_logits = __need_logits(running) @@ -814,6 +845,7 @@ def __need_logits(seqs: SeqList): swap_in_map=schedule_output.swap_in_map, swap_out_map=schedule_output.swap_out_map, all_ids=all_ids, + guided_input_ids=guided_input_ids, sampling_inputs=sampling_inputs, num_appendable_ids=num_appendable_ids, num_ignore_eos=num_ignore_eos, @@ -912,7 +944,7 @@ async def async_batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, @@ -922,7 +954,7 @@ async def async_batched_infer( Args: session_ids (List[int]): The session id. token_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_names (List[str]): The name of the adapters. keep_cache (bool): Keep kv cache after infer. @@ -944,7 +976,7 @@ def batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 9d9ebf9198..4758585eca 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import List -from lmdeploy.messages import EngineGenerationConfig, EngineOutput +from lmdeploy.messages import EngineOutput, GenerationConfig from lmdeploy.utils import get_logger from ..messages import (InputEmbeddingRangeType, InputEmbeddings, @@ -129,7 +129,7 @@ async def async_stream_infer( self, session_id: int, input_ids: List[int], - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_name: str = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, @@ -139,7 +139,7 @@ async def async_stream_infer( Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_name (str): The lora adapter name. 
Yields: @@ -150,7 +150,7 @@ async def async_stream_infer( if len(input_ids) > self.max_input_len: yield EngineOutput(ResponseType.INPUT_LENGTH_ERROR, [], 0) return - gen_config = gen_config or EngineGenerationConfig() + gen_config = gen_config or GenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) await self.req_sender.async_send_async( RequestType.ADD_SESSION, dict(session_id=session_id, @@ -191,7 +191,7 @@ async def async_infer( self, session_id: int, input_ids: List[int] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, **kwargs): @@ -200,7 +200,7 @@ async def async_infer( Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. Returns: int: Error flags. 0 if success. @@ -225,7 +225,7 @@ async def async_infer( def stream_infer(self, session_id: int, input_ids: List[int], - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_name: str = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, @@ -235,7 +235,7 @@ def stream_infer(self, Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_name (str): The lora adapter name. Yields: @@ -268,7 +268,7 @@ def __call_async(): yield from __call_async() return - gen_config = gen_config or EngineGenerationConfig() + gen_config = gen_config or GenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) self.req_sender.send_async(RequestType.ADD_SESSION, dict(session_id=session_id, response=False)) @@ -308,7 +308,7 @@ def __call_async(): def infer(self, session_id: int, input_ids: List[int] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, **kwargs): @@ -317,7 +317,7 @@ def infer(self, Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. Returns: int: Error flags. 0 if success. @@ -343,7 +343,7 @@ async def async_batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, @@ -354,7 +354,7 @@ async def async_batched_infer( Args: session_ids (List[int]): The session id. token_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_names (List[str]): The name of the adapters. keep_cache (bool): Keep kv cache after infer. 
@@ -443,7 +443,7 @@ def batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, diff --git a/lmdeploy/pytorch/engine/guided_process.py b/lmdeploy/pytorch/engine/guided_process.py new file mode 100644 index 0000000000..444a809c81 --- /dev/null +++ b/lmdeploy/pytorch/engine/guided_process.py @@ -0,0 +1,170 @@ +# Copyright 2024- the Outlines developers +# This file is adapted from +# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +import copy +import math +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import defaultdict +from functools import lru_cache +from typing import DefaultDict, Dict, List, Union + +import torch +from outlines.fsm.guide import CFGGuide, Generate, RegexGuide, Write +from outlines.fsm.json_schema import build_regex_from_schema +from pydantic import BaseModel +from transformers import PreTrainedTokenizerBase + + +class BaseLogitsProcessor: + + def init_state(self): + """Initialize the FSM states.""" + self.fsm_state: DefaultDict[int, int] = defaultdict(int) + + def __call__(self, input_ids: List[int], + scores: torch.Tensor) -> torch.Tensor: + """Use the FSM to bias the logits before sampling the next token.""" + + seq_id = hash(tuple(input_ids)) + + if len(input_ids) == 0: + self.init_state() + else: + last_token = input_ids[-1] + last_seq_id = hash(tuple(input_ids[:-1])) + self.fsm_state[seq_id] = self.fsm.get_next_state( + state=self.fsm_state[last_seq_id], token_id=last_token) + + instruction = self.fsm.get_next_instruction(self.fsm_state[seq_id]) + + if type(instruction) == Generate: + allowed_tokens = instruction.tokens + elif type(instruction) == Write: + # TODO: support fast forward tokens + allowed_tokens = [instruction.tokens[0]] + else: + raise TypeError( + f'Unsupported instruction type {type(instruction)}') + + mask = torch.full((scores.shape[-1], ), + -math.inf, + device=scores.device) + mask[allowed_tokens] = 0 + scores.add_(mask) + + return scores + + def adapt_tokenizer(self, tokenizer): + """Adapt tokenizer to use to compile the FSM. + + The API of Outlines tokenizers is slightly different to that of + `transformers`. In addition we need to handle the missing spaces to + Llama's tokenizer to be able to compile FSMs for this model. + """ + from outlines.integrations.utils import adapt_tokenizer + tokenizer = adapt_tokenizer(tokenizer) + # vocab size greater than logits shape because of '[UNUSED_TOKEN_...]' + if hasattr(tokenizer, '_tokenizer'): + tokenizer.vocabulary = tokenizer._tokenizer.get_vocab( + with_added_tokens=False) + return tokenizer + + +class RegexLogitsProcessor(BaseLogitsProcessor): + + def __init__(self, regex_string: str, tokenizer): + """Compile the FSM that drives the regex-structured generation. 
+ + Args: + regex_string: A string that represents a regular expression + tokenizer: The model's tokenizer + """ + tokenizer = self.adapt_tokenizer(copy.deepcopy(tokenizer)) + fsm = RegexGuide(regex_string, tokenizer) + self.fsm = fsm + + +class JSONLogitsProcessor(RegexLogitsProcessor): + + def __init__(self, schema: Union[str, Dict, BaseModel], tokenizer): + """Compile the FSM that drives the JSON-guided generation. + + Args: + schema: A str schema that encodes the structure we want the model + to generate + tokenizer: The model's tokenizer + """ + regex_string = build_regex_from_schema(schema) + super().__init__(regex_string, tokenizer) + + +class CFGLogitsProcessor(BaseLogitsProcessor): + + def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase): + """Compile the FSM that drives the context free grammar generation. + + Parameters + ---------- + cfg + A string that represents a context-free grammar + tokenizer + The model's tokenizer + """ + tokenizer = self.adapt_tokenizer(tokenizer) + fsm = CFGGuide(cfg, tokenizer) + self.fsm = fsm + + +# copied from https://github.com/vllm-project/vllm/blob/a7f65c2be93f491771aca31106f790bf381c0bad/vllm/model_executor/guided_decoding/outlines_decoding.py#L31 # noqa +JSON_GRAMMAR = r""" +?start: object | array + +?value: object +| array +| UNESCAPED_STRING +| SIGNED_NUMBER -> number +| "true" -> true +| "false" -> false +| "null" -> null + +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : UNESCAPED_STRING ":" value + +%import common.UNESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS + +%ignore WS +""" + + +@lru_cache(maxsize=32) +def _get_guided_logits_processor(guide: str, + tokenizer: PreTrainedTokenizerBase, + type: str): + try: + if type == 'json_object': + return CFGLogitsProcessor(guide, tokenizer) + elif type == 'json_schema': + return JSONLogitsProcessor(guide, tokenizer) + elif type == 'regex_schema': + return RegexLogitsProcessor(guide, tokenizer) + else: + return None + except Exception as e: + from lmdeploy.utils import get_logger + logger = get_logger('lmdeploy') + logger.error(e) + return None diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 52f99afa35..83c33faaf9 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
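As an aside on the new `guided_process.py` module above: a hedged sketch of how the cached factory and the processors are meant to be used. The tokenizer name is an assumption, `outlines` must be installed, and `_get_guided_logits_processor` returns `None` when FSM compilation fails, so the result should be checked.

import torch
from transformers import AutoTokenizer

from lmdeploy.pytorch.engine.guided_process import (JSON_GRAMMAR,
                                                    _get_guided_logits_processor)

# illustrative tokenizer; any HF tokenizer that outlines can adapt will do
tokenizer = AutoTokenizer.from_pretrained('internlm/internlm2-chat-7b',
                                          trust_remote_code=True)

# regex-constrained decoding; the compiled FSM is cached via lru_cache
processor = _get_guided_logits_processor(r'call me [A-Za-z]{1,10}',
                                         tokenizer, 'regex_schema')

scores = torch.randn(len(tokenizer))     # illustrative next-token logits
if processor is not None:
    scores = processor([], scores)       # disallowed tokens are pushed to -inf

# free-form JSON without a schema falls back to the Lark grammar above
json_processor = _get_guided_logits_processor(JSON_GRAMMAR, tokenizer,
                                              'json_object')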
+import json from dataclasses import asdict, dataclass -from typing import List +from typing import Dict, List, Optional, Tuple import torch from transformers.generation.logits_process import LogitsWarper from lmdeploy.messages import LogitsProcessor +from lmdeploy.tokenizer import Tokenizer from ..messages import SchedulerSequence @@ -95,6 +97,40 @@ def _multinomial_sampling(scores: torch.Tensor, return multinomial_sampling(scores, seeds, offsets, indices) +def _guided_sampling(response_formats: Tuple[Dict], scores: torch.Tensor, + guided_input_ids: Optional[torch.Tensor], + tokenizer: object): + if guided_input_ids is None: + return scores + for i in range(len(response_formats)): + _format = response_formats[i] + if isinstance(_format, Dict) and _format.get('type', 'text') != 'text': + if _format['type'] == 'json_schema': + schema = _format['json_schema'] + if isinstance(schema, Dict): + for key in ['json_schema', 'schema']: + if key in schema: + schema = json.dumps(schema[key]) + elif schema is None: + from .guided_process import JSON_GRAMMAR + schema = JSON_GRAMMAR + elif isinstance(schema, str): + raise ValueError( + f'Cannot parse schema {schema}. The schema must be ' + 'either a dictionary or a string that contains the' + ' JSON Schema specification') + elif _format['type'] == 'regex_schema': + schema = _format.get('regex_schema', '') + else: + raise ValueError(f"unsupported format type: {_format['type']}") + from .guided_process import _get_guided_logits_processor + processor = _get_guided_logits_processor(schema, tokenizer, + _format['type']) + if processor: + scores[i] = processor(guided_input_ids[i].tolist(), scores[i]) + return scores + + @dataclass class SamplingInputs: temperature: torch.Tensor = None @@ -107,6 +143,7 @@ class SamplingInputs: random_offsets: int = None max_top_k: int = 1 min_top_p: float = 1.0 + response_formats: Tuple[str] = () logits_processors: List[List[LogitsProcessor]] = None @classmethod @@ -121,6 +158,7 @@ def from_sampling_params(cls, seqs: List[SchedulerSequence]): stop_words = [None] * batch_size random_seeds = [torch.seed() & 0xffffffff] * batch_size random_offsets = [None] * batch_size + response_formats = [None] * batch_size logits_processors = [None] * batch_size def __gather_params(): @@ -132,6 +170,7 @@ def __gather_params(): top_k[idx] = param.top_k top_p[idx] = param.top_p random_offsets[idx] = seq.random_offsets + response_formats[idx] = param.response_format if param.random_seed is not None: random_seeds[idx] = param.random_seed & 0xffffffff @@ -204,6 +243,7 @@ def __get_bad_words(bad_words): top_p=top_p, random_seeds=random_seeds, random_offsets=random_offsets, + response_formats=tuple(response_formats), max_top_k=max_top_k, min_top_p=min_top_p, logits_processors=logits_processors, @@ -235,21 +275,26 @@ def _apply_custom_logits_processors(batched_logits_processors, all_ids, class FusedLogitsProcessor(LogitsWarper): """Custom logits processor.""" - def __init__(self, sampling_inputs: SamplingInputs, - ignore_eos: torch.Tensor): + def __init__(self, + sampling_inputs: SamplingInputs, + ignore_eos: torch.Tensor, + tokenizer: Optional[Tokenizer] = None): self.sampling_inputs: SamplingInputs = sampling_inputs self.ignore_eos = ignore_eos + self.tokenizer = tokenizer - def __call__(self, scores: torch.FloatTensor, - all_ids: torch.LongTensor) -> torch.FloatTensor: + def __call__(self, all_ids: torch.LongTensor, + guided_input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: r""" Args: + all_ids (torch.LongTensor): All 
the token ids. + guided_input_ids (torch.LongTensor): Guided prompt ids. scores (torch.FloatTensor): Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam search or log softmax for each vocabulary token when using beam search - all_ids (torch.LongTensor): All the token ids. Return: @@ -282,6 +327,8 @@ def __call__(self, scores: torch.FloatTensor, stop_words = torch.where(self.ignore_eos[:, None], stop_words, -1) scores = _process_bad_words(scores, stop_words) + scores = _guided_sampling(sampling_inputs.response_formats, scores, + guided_input_ids, self.tokenizer) return scores def sampling(self, logits: torch.Tensor): diff --git a/lmdeploy/pytorch/kernels/ascend/__init__.py b/lmdeploy/pytorch/kernels/ascend/__init__.py index bd207a1ecb..8ab92e0158 100644 --- a/lmdeploy/pytorch/kernels/ascend/__init__.py +++ b/lmdeploy/pytorch/kernels/ascend/__init__.py @@ -1,6 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -from ..dipu import (apply_rotary_pos_emb, fill_kv_cache, fused_rotary_emb, - multinomial_sampling, paged_attention_fwd, rms_norm) +from ..default import multinomial_sampling +from .apply_rotary_pos_emb import apply_rotary_pos_emb +from .fill_kv_cache import fill_kv_cache +from .fused_rotary_emb import fused_rotary_emb +from .moe_gating_topk_softmax import moe_gating_topk_softmax +from .pagedattention import paged_attention_fwd +from .rms_norm import rms_norm __all__ = [ 'rms_norm', @@ -8,5 +13,6 @@ 'fused_rotary_emb', 'fill_kv_cache', 'paged_attention_fwd', + 'moe_gating_topk_softmax', 'multinomial_sampling', ] diff --git a/lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py new file mode 100644 index 0000000000..4a4039c44d --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
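Looking back at `_guided_sampling` above: each element of `response_formats` is the per-request dict derived from `GenerationConfig.response_format`. A hedged sketch of the shapes it accepts (the schema itself is illustrative):

# hypothetical per-sequence entries as consumed by _guided_sampling
json_format = {
    'type': 'json_schema',
    'json_schema': {'schema': {'type': 'object',
                               'properties': {'name': {'type': 'string'}},
                               'required': ['name']}},
}
regex_format = {'type': 'regex_schema',
                'regex_schema': r'call me [A-Za-z]{1,10}'}
plain_format = {'type': 'text'}   # no guiding; logits pass through untouched
response_formats = (json_format, regex_format, plain_format)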
+import dlinfer.ops as ext_ops +from torch import Tensor + + +def apply_rotary_pos_emb( + query_states: Tensor, + key_states: Tensor, + cos: Tensor, + sin: Tensor, + position_ids: Tensor, + position_ids_1d: Tensor, + q_embed=None, + k_embed=None, + context=None, +): + bs, head, dim = query_states.shape + num_kv_heads = key_states.shape[1] + query_states_reshaped = query_states.reshape(1, bs, head, dim) + key_states_reshaped = key_states.reshape(1, bs, num_kv_heads, dim) + if not (hasattr(context, 'cos') or hasattr(context, 'sin')): + if len(cos.shape) == 3 and len(sin.shape) == 3: + cos = cos[:, position_ids_1d].view(1, bs, 1, -1) + sin = sin[:, position_ids_1d].view(1, bs, 1, -1) + elif len(cos.shape) == 2 and len(sin.shape) == 2: + cos = cos[position_ids_1d].view(1, bs, 1, -1) + sin = sin[position_ids_1d].view(1, bs, 1, -1) + else: + raise RuntimeError('Cannot handle cos/sin shape dims!') + + if context: + setattr(context, 'cos', cos) + setattr(context, 'sin', sin) + cached_cos = context.cos if context else cos + cached_sin = context.sin if context else sin + query_states, key_states = ext_ops.apply_rotary_pos_emb( + query_states_reshaped, key_states_reshaped, cached_cos, cached_sin, + None, None) + query_states = query_states.view(bs, head, dim) + key_states = key_states.view(bs, num_kv_heads, dim) + if q_embed is None: + q_embed = query_states + else: + q_embed.copy_(query_states) + if k_embed is None: + k_embed = key_states + else: + k_embed.copy_(key_states) + return q_embed, k_embed diff --git a/lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py b/lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py new file mode 100644 index 0000000000..333e500532 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import dlinfer.ops as ext_ops +from torch import Tensor + + +def fill_kv_cache( + key_states: Tensor, + value_states: Tensor, + key_caches: Tensor, + value_caches: Tensor, + q_start_loc: Tensor, + q_seq_length: Tensor, + kv_seq_length: Tensor, + max_q_seq_length: int, + block_offsets: Tensor, + context: None, +): + """fill key/value state to cache for paged attention.""" + ext_ops.fill_kv_cache(key_states, value_states, key_caches, value_caches, + context.kv_start_indices) diff --git a/lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py b/lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py new file mode 100644 index 0000000000..03fa2910af --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
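For reference on the rotary kernels being moved from dipu to ascend here: they are expected to match the standard rotary position embedding. A plain-PyTorch sketch of that formula, as a semantic reference only (layout and half-rotation conventions differ between the fused backends):

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_ref(q: torch.Tensor, k: torch.Tensor,
                     cos: torch.Tensor, sin: torch.Tensor):
    # q, k: [..., dim]; cos/sin broadcast over the leading dims
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin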
+import dlinfer.ops as ext_ops +import torch +from torch import Tensor + + +def fused_rotary_emb( + query_states: Tensor, + key_states: Tensor, + position_ids: torch.LongTensor, + inv_freq: Tensor, + scaling_factor: float, + out_q: Tensor = None, + out_k: Tensor = None, + context=None, +): + batch, seqlen, head, dim = query_states.shape + num_kv_heads = key_states.shape[-2] + query_states_reshaped = query_states.view(batch, seqlen, head, dim) + key_states_reshaped = key_states.view(batch, seqlen, num_kv_heads, dim) + position_ids = position_ids.squeeze(0).unsqueeze(-1) + pos_freq = position_ids / scaling_factor * inv_freq + if not (hasattr(context, 'cos') or hasattr(context, 'sin')): + cos = (torch.cos(pos_freq).view(batch, seqlen, 1, + -1).repeat(1, 1, 1, + 2).to(query_states.dtype)) + sin = (torch.sin(pos_freq).view(batch, seqlen, 1, + -1).repeat(1, 1, 1, + 2).to(query_states.dtype)) + if context: + setattr(context, 'cos', cos) + setattr(context, 'sin', sin) + cached_cos = context.cos if context else cos + cached_sin = context.sin if context else sin + ext_ops.apply_rotary_pos_emb(query_states_reshaped, key_states_reshaped, + cached_cos, cached_sin, None, None) + if out_q is None: + out_q = query_states + else: + out_q.copy_(query_states) + if out_k is None: + out_k = key_states + else: + out_k.copy_(key_states) + return out_q, out_k diff --git a/lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py new file mode 100644 index 0000000000..87b5ad1b39 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import dlinfer.ops as ext_ops +import torch +from torch import Tensor + + +def moe_gating_topk_softmax(router_logits: Tensor, topk: int): + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax( + router_logits, topk) + return routing_weights.to(torch.float32), selected_experts.to(torch.int64) diff --git a/lmdeploy/pytorch/kernels/ascend/pagedattention.py b/lmdeploy/pytorch/kernels/ascend/pagedattention.py new file mode 100644 index 0000000000..aa2609e476 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/pagedattention.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
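The new `moe_gating_topk_softmax` wrapper above hands off to a fused dlinfer op. A hedged plain-PyTorch sketch of the gating it is expected to implement (whether the fused op renormalizes the selected top-k weights is backend-defined, so treat this as a shape and dtype reference):

import torch

def moe_gating_topk_softmax_ref(router_logits: torch.Tensor, topk: int):
    # router_logits: [num_tokens, num_experts]
    routing_weights = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1)
    return routing_weights, selected_experts.to(torch.int64)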
+import dlinfer.ops as ext_ops +import torch +from torch import Tensor + + +def prefill_attention( + query_states: Tensor, + key_states: Tensor, + value_states: Tensor, + attn_output: Tensor, + key_cache: Tensor, + value_cache: Tensor, + block_offsets: Tensor, + q_start_loc: Tensor, + q_seq_len: Tensor, + kv_seq_len: Tensor, + block_size: int, + kv_cache_len: int, + context=None, +): + num_q_heads, dim = query_states.shape[1:3] + num_kv_heads = value_states.shape[1] + + if context.is_unpaged_prefill: + ext_ops.prefill_attention( + query_states, + key_states, + value_states, + q_start_loc, + q_seq_len, + context.max_q_seq_length, + num_q_heads, + num_kv_heads, + attn_mask=context.attention_mask, + attn_output=attn_output, + ) + else: + key_cache = key_cache.reshape(1, kv_cache_len, num_kv_heads * dim) + value_cache = value_cache.reshape(1, kv_cache_len, num_kv_heads * dim) + ext_ops.paged_prefill_attention( + query_states, + key_cache, + value_cache, + block_offsets, + block_size, + q_start_loc, + q_seq_len, + kv_seq_len, + num_q_heads, + num_kv_heads, + attn_mask=context.attention_mask, + attn_output=attn_output, + ) + + +def paged_decode_attention(q, k_cache, v_cache, attn_output, kv_seq_len, + max_kv_seq_len, block_offsets, block_size): + num_kv_heads, num_q_heads = k_cache.shape[1], q.shape[1] + ext_ops.paged_decode_attention( + q, + k_cache, + v_cache, + block_offsets, + block_size, + kv_seq_len, + max_kv_seq_len, + num_q_heads, + num_kv_heads, + attn_output=attn_output.view(q.shape), + ) + + +def paged_attention_fwd( + query_states: Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + key_cache: Tensor, + value_cache: Tensor, + attn_output: Tensor, + block_offsets: Tensor, + q_start_loc: Tensor, + q_seqlens: Tensor, + kv_seqlens: Tensor, + max_seqlen: int, + window_size: int = 1, + context=None, +): + is_decoding = query_states.shape[-3] == q_seqlens.size(0) + block_num, block_size, head, dim = key_cache.size() + kv_cache_len = block_num * block_size + k = key_cache.reshape(block_num * block_size, head, dim) + v = value_cache.reshape(block_num * block_size, head, dim) + if not is_decoding: + prefill_attention( + query_states, + key_states, + value_states, + attn_output, + k, + v, + block_offsets, + q_start_loc, + q_seqlens, + kv_seqlens, + block_size, + kv_cache_len, + context=context, + ) + else: + paged_decode_attention( + query_states, + k, + v, + attn_output, + kv_seqlens, + context.max_kv_seq_length, + block_offsets, + block_size, + ) diff --git a/lmdeploy/pytorch/kernels/ascend/rms_norm.py b/lmdeploy/pytorch/kernels/ascend/rms_norm.py new file mode 100644 index 0000000000..57b2f26c21 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/rms_norm.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import dlinfer.ops as ext_ops +from torch import Tensor + + +def rms_norm(hidden_states: Tensor, + weight: Tensor, + eps: float = 1e-6, + out: Tensor = None): + rms_norm_out = ext_ops.rms_norm(hidden_states, weight, eps) + if out is None: + out = rms_norm_out + else: + out.copy_(rms_norm_out) + return out diff --git a/lmdeploy/pytorch/kernels/dipu/__init__.py b/lmdeploy/pytorch/kernels/dipu/__init__.py deleted file mode 100644 index 65ebc8cec1..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
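Similarly, the ascend `rms_norm` wrapper above mirrors the usual RMSNorm computation; a reference sketch in plain PyTorch, accumulating in fp32 for stability:

import torch

def rms_norm_ref(hidden_states: torch.Tensor, weight: torch.Tensor,
                 eps: float = 1e-6) -> torch.Tensor:
    variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
    out = hidden_states.float() * torch.rsqrt(variance + eps)
    return (weight.float() * out).to(hidden_states.dtype)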
-from ..default import multinomial_sampling -from .apply_rotary_pos_emb import apply_rotary_pos_emb -from .fill_kv_cache import fill_kv_cache -from .fused_rotary_emb import fused_rotary_emb -from .pagedattention import paged_attention_fwd -from .rms_norm import rms_norm - -__all__ = [ - 'rms_norm', - 'apply_rotary_pos_emb', - 'fused_rotary_emb', - 'fill_kv_cache', - 'paged_attention_fwd', - 'multinomial_sampling', -] diff --git a/lmdeploy/pytorch/kernels/dipu/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dipu/apply_rotary_pos_emb.py deleted file mode 100644 index 559cf8afba..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/apply_rotary_pos_emb.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -from torch import Tensor - - -def apply_rotary_pos_emb( - query_states: Tensor, - key_states: Tensor, - cos: Tensor, - sin: Tensor, - position_ids: Tensor, - position_ids_1d: Tensor, - q_embed=None, - k_embed=None, - context=None, -): - bs, head, dim = query_states.shape - numKeyValueHeads = key_states.shape[1] - query_states = query_states.reshape(bs, head * dim) - key_states = key_states.reshape(bs, numKeyValueHeads * dim) - if not (hasattr(context, 'cos') or hasattr(context, 'sin')): - if cos.dim() == 3: - cos = cos[:, position_ids_1d].view(1, bs, 1, -1) - sin = sin[:, position_ids_1d].view(1, bs, 1, -1) - elif cos.dim() == 2: - cos = cos[position_ids_1d].view(1, bs, 1, -1) - sin = sin[position_ids_1d].view(1, bs, 1, -1) - else: - raise RuntimeError(f'Unsupport cos dim: {cos.dim()}') - setattr(context, 'cos', cos) - setattr(context, 'sin', sin) - ext.rotary_embedding_v2(query_states, key_states, context.cos, context.sin, - dim) - return query_states.view(bs, head, - dim), key_states.view(bs, numKeyValueHeads, dim) diff --git a/lmdeploy/pytorch/kernels/dipu/fill_kv_cache.py b/lmdeploy/pytorch/kernels/dipu/fill_kv_cache.py deleted file mode 100644 index f51b851185..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/fill_kv_cache.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -from torch import Tensor - - -def fill_kv_cache( - key_states: Tensor, - value_states: Tensor, - key_caches: Tensor, - value_caches: Tensor, - q_start_loc: Tensor, - q_seq_length: Tensor, - kv_seq_length: Tensor, - max_q_seq_length: int, - block_offsets: Tensor, - context: None, -): - """fill key/value state to cache for paged attention.""" - dest_index_copy_kv(key_states, context.kv_start_indices, key_caches) - dest_index_copy_kv(value_states, context.kv_start_indices, value_caches) - - -def dest_index_copy_kv(states, dest_loc, caches): - block_num, block_size, head, dim = caches.size() - caches_tmp = caches.view(block_num * block_size, head, dim) - ext.dest_index_copy_kv(states, dest_loc, caches_tmp) - caches[:] = caches_tmp.view(block_num, block_size, head, dim) diff --git a/lmdeploy/pytorch/kernels/dipu/fused_rotary_emb.py b/lmdeploy/pytorch/kernels/dipu/fused_rotary_emb.py deleted file mode 100644 index 2a67a24516..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/fused_rotary_emb.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import deeplink_ext.cpp_extensions as ext -import torch -from torch import Tensor - - -def fused_rotary_emb( - query_states: Tensor, - key_states: Tensor, - position_ids: torch.LongTensor, - inv_freq: Tensor, - scaling_factor: float, - out_q: Tensor = None, - out_k: Tensor = None, - context=None, -): - _, bs, head, dim = query_states.shape - _, _, numKeyValueHeads, _ = key_states.shape - query_states = query_states.view(bs, head * dim) - key_states = key_states.view(bs, numKeyValueHeads * dim) - position_ids = position_ids.squeeze(0).unsqueeze(-1) - pos_freq = position_ids / scaling_factor * inv_freq - if not (hasattr(context, 'cos') or hasattr(context, 'sin')): - cos = (torch.cos(pos_freq).view(position_ids.shape[0], 1, - -1).repeat(1, 1, - 2).to(query_states.dtype)) - sin = (torch.sin(pos_freq).view(position_ids.shape[0], 1, - -1).repeat(1, 1, - 2).to(query_states.dtype)) - setattr(context, 'cos', cos) - setattr(context, 'sin', sin) - ext.rotary_embedding_v2(query_states, key_states, context.cos, context.sin, - dim) - query_states = query_states.view(1, bs, head, dim) - key_states = key_states.view(1, bs, numKeyValueHeads, dim) - return query_states, key_states diff --git a/lmdeploy/pytorch/kernels/dipu/pagedattention.py b/lmdeploy/pytorch/kernels/dipu/pagedattention.py deleted file mode 100644 index 9304ec0a35..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/pagedattention.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -import torch -from torch import Tensor - - -def flash_context_attention( - query_states: Tensor, - key_states: Tensor, - value_states: Tensor, - attn_output: Tensor, - key_cache: Tensor, - value_cache: Tensor, - block_offsets: Tensor, - q_start_loc: Tensor, - q_seqlens: list, - kv_seqlens: list, - block_size: int, - kv_cache_len: int, - context=None, -): - batch, head, dim = ( - q_start_loc.shape[0], - query_states.shape[1], - query_states.shape[2], - ) - numKeyValueHeads = value_states.shape[1] - assert key_states.shape[1] == value_states.shape[1] - for i in range(batch): - start = q_start_loc[i] - end = start + q_seqlens[i] - single_seqlen = int(end - start) - single_q = query_states[start:end].view(1, single_seqlen, -1) - single_k = key_states[start:end].reshape(1, single_seqlen, -1) - single_v = value_states[start:end].reshape(1, single_seqlen, -1) - single_out = attn_output[start:end, :].view(1, single_seqlen, -1) - mask = context.attention_mask[i] - if q_seqlens[i] == kv_seqlens[i]: - ext.prompt_flash_attention( - single_out, - single_q, - single_k, - single_v, - mask, - [kv_seqlens[i]], - kv_seqlens[i], - head, - numKeyValueHeads, - dim, - ) - else: - key_cache = key_cache.reshape(1, kv_cache_len, - numKeyValueHeads * dim) - value_cache = value_cache.reshape(1, kv_cache_len, - numKeyValueHeads * dim) - for j in range(q_seqlens[i]): - single_q = query_states[start + j:start + j + 1].view(1, 1, -1) - single_out = attn_output[start + j:start + j + 1].view( - 1, 1, -1) - ext.paged_attention( - single_out, - single_q, - key_cache, - value_cache, - mask[j:j + 1], - [kv_seqlens[i]], - head, - numKeyValueHeads, - dim, - block_offsets[i:i + 1], - block_size, - ) - - -def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seqlens, - block_table, block_size): - numKeyValueHeads = k_cache.shape[1] - assert k_cache.shape[1] == v_cache.shape[1] - bs, head, dim = q.shape - kv_cache_len = k_cache.shape[0] - q = q.reshape(bs, 1, head * dim) - k_cache = k_cache.reshape(1, kv_cache_len, 
numKeyValueHeads * dim) - v_cache = v_cache.reshape(1, kv_cache_len, numKeyValueHeads * dim) - ext.paged_attention( - attn_output.view(q.shape), - q, - k_cache, - v_cache, - None, - kv_seqlens, - head, - numKeyValueHeads, - dim, - block_table, - block_size, - ) - - -def paged_attention_fwd( - query_states: Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - key_cache: Tensor, - value_cache: Tensor, - attn_output: Tensor, - block_offsets: Tensor, - q_start_loc: Tensor, - q_seqlens: Tensor, - kv_seqlens: Tensor, - max_seqlen: int, - window_size: int = 1, - context=None, -): - is_decoding = query_states.shape[-3] == q_seqlens.size(0) - block_num, block_size, head, dim = key_cache.size() - kv_cache_len = block_num * block_size - k = key_cache.reshape(block_num * block_size, head, dim) - v = value_cache.reshape(block_num * block_size, head, dim) - if not is_decoding: - flash_context_attention( - query_states, - key_states, - value_states, - attn_output, - k, - v, - block_offsets.to(torch.int32), - q_start_loc, - q_seqlens.tolist(), - kv_seqlens.tolist(), - block_size, - kv_cache_len, - context=context, - ) - else: - paged_token_attention( - query_states, - k, - v, - attn_output, - kv_seqlens.tolist(), - block_offsets.to(torch.int32), - block_size, - ) diff --git a/lmdeploy/pytorch/kernels/dipu/rms_norm.py b/lmdeploy/pytorch/kernels/dipu/rms_norm.py deleted file mode 100644 index 8dbcf91ca2..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/rms_norm.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -import torch -from torch import Tensor - - -def rms_norm(hidden_states: Tensor, weight: Tensor, eps: float = 1e-6): - output = torch.empty_like(hidden_states) - inv_rms_shape = list(hidden_states.shape[:-1]) + [1] - inv_rms = torch.empty(inv_rms_shape, - dtype=torch.float32, - device=hidden_states.device) - ext.rms_norm(output, inv_rms, hidden_states, weight.shape, weight, None, - eps) - return output diff --git a/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py new file mode 100644 index 0000000000..b8a55d4225 --- /dev/null +++ b/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .dispatcher import FunctionDispatcher + +moe_gating_topk_softmax = FunctionDispatcher( + 'moe_gating_topk_softmax').make_caller() diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index c0aa9cf61d..0bb6cf6c40 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -7,7 +7,7 @@ import numpy as np from torch import Tensor -from lmdeploy.messages import EngineGenerationConfig, LogitsProcessor +from lmdeploy.messages import GenerationConfig, LogitsProcessor from lmdeploy.utils import get_logger from .block import LogicalTokenBlocks @@ -46,15 +46,16 @@ class SamplingParam: bad_words: List[int] = field(default_factory=list) max_new_tokens: int = 512 min_new_tokens: int = 0 + response_format: Optional[str] = None logits_processors: Optional[List[LogitsProcessor]] = None @classmethod - def from_gen_config(self, gen_config: EngineGenerationConfig): + def from_gen_config(self, gen_config: GenerationConfig): """from gen config.""" min_new_tokens = gen_config.min_new_tokens or 0 - stop_words = gen_config.stop_words or [] - bad_words = gen_config.bad_words or [] + stop_words = gen_config.stop_token_ids or [] + bad_words = gen_config.bad_token_ids or [] if gen_config.ignore_eos: bad_words += stop_words stop_words = [] @@ -64,6 +65,7 @@ def from_gen_config(self, gen_config: EngineGenerationConfig): temperature = gen_config.temperature repetition_penalty = gen_config.repetition_penalty max_new_tokens = gen_config.max_new_tokens + response_format = gen_config.response_format if top_p < 0 or top_p > 1.0: logger.warning('`top_p` has to be a float > 0 and < 1' @@ -97,6 +99,7 @@ def from_gen_config(self, gen_config: EngineGenerationConfig): random_seed=gen_config.random_seed, stop_words=stop_words, bad_words=bad_words, + response_format=response_format, max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, logits_processors=gen_config.logits_processors) diff --git a/lmdeploy/pytorch/models/baichuan.py b/lmdeploy/pytorch/models/baichuan.py index 1a7c319522..a8c01e45ee 100644 --- a/lmdeploy/pytorch/models/baichuan.py +++ b/lmdeploy/pytorch/models/baichuan.py @@ -67,6 +67,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Rewrite of Attention.forward.""" @@ -186,6 +187,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Rewrite of BaichuanAttention.forward.""" diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index d472e01947..d2009d30b6 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -213,15 +213,14 @@ def _contiguous_batching_forward( return output, kv_cache - def forward( - self, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=None, - use_cache=True, - output_attentions=False, - ): + def forward(self, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=None, + use_cache=True, + output_attentions=False, + **kwargs): return self._contiguous_batching_forward( hidden_states, rotary_pos_emb, diff --git a/lmdeploy/pytorch/models/deepseek.py b/lmdeploy/pytorch/models/deepseek.py index 7bf0468064..331c689682 100644 --- a/lmdeploy/pytorch/models/deepseek.py +++ b/lmdeploy/pytorch/models/deepseek.py @@ -140,6 
+140,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """forward.""" diff --git a/lmdeploy/pytorch/models/falcon.py b/lmdeploy/pytorch/models/falcon.py index 8f5ea9a6b1..d68a0c32e7 100644 --- a/lmdeploy/pytorch/models/falcon.py +++ b/lmdeploy/pytorch/models/falcon.py @@ -215,16 +215,15 @@ def __rotary_emb_fn(query_states, key_states, value_states): else: return output_tensor, layer_past - def forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - ): + def forward(self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + **kwargs): return self._contiguous_batching_forward(hidden_states, alibi, layer_past) diff --git a/lmdeploy/pytorch/models/internlm.py b/lmdeploy/pytorch/models/internlm.py index 54def0159e..4e4e140370 100644 --- a/lmdeploy/pytorch/models/internlm.py +++ b/lmdeploy/pytorch/models/internlm.py @@ -123,6 +123,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """forward.""" diff --git a/lmdeploy/pytorch/models/phi3.py b/lmdeploy/pytorch/models/phi3.py index 646b002435..5a4329d690 100644 --- a/lmdeploy/pytorch/models/phi3.py +++ b/lmdeploy/pytorch/models/phi3.py @@ -152,6 +152,7 @@ def forward( past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """rewrite of forward.""" diff --git a/lmdeploy/pytorch/models/starcoder2.py b/lmdeploy/pytorch/models/starcoder2.py index 7a1e9150a1..1a2d9d7488 100644 --- a/lmdeploy/pytorch/models/starcoder2.py +++ b/lmdeploy/pytorch/models/starcoder2.py @@ -170,6 +170,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """forward.""" diff --git a/lmdeploy/pytorch/passkey_retrieval.py b/lmdeploy/pytorch/passkey_retrieval.py index 429ee1a423..460fa70317 100644 --- a/lmdeploy/pytorch/passkey_retrieval.py +++ b/lmdeploy/pytorch/passkey_retrieval.py @@ -3,7 +3,7 @@ import os import random -from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig from lmdeploy.model import MODELS from lmdeploy.tokenizer import Tokenizer @@ -33,7 +33,7 @@ def __init__(self, self.generator = self.tm_model.create_instance() self.model = MODELS.get(model_name)() seed = random.getrandbits(64) - self.gen_config = EngineGenerationConfig( + self.gen_config = GenerationConfig( max_new_tokens=32, top_k=40, top_p=0.8, diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 47ff4e083e..93950c85cc 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -5,13 +5,13 
@@ import os import random from contextlib import asynccontextmanager +from copy import deepcopy from itertools import count from queue import Empty, Queue from threading import Thread from typing import Any, Dict, List, Literal, Optional, Tuple, Union -from lmdeploy.messages import (EngineGenerationConfig, GenerationConfig, - PytorchEngineConfig, Response, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, Response, TurbomindEngineConfig) from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model from lmdeploy.serve.utils import LogitsMixin, _get_event_loop @@ -23,18 +23,17 @@ def get_names_from_model(model_path: str, model_name: str = None): """Get model name and chat template name from workspace model.""" - from configparser import ConfigParser triton_model_path = os.path.join(model_path, 'triton_models', 'weights') if not os.path.exists(triton_model_path): chat_template_name = best_match_model(model_path) else: # `model_path` refers to a turbomind model, reading # chat_template_name from the config - ini_path = os.path.join(triton_model_path, 'config.ini') - with open(ini_path, 'r') as f: - parser = ConfigParser() - parser.read_file(f) - chat_template_name = parser['llama']['chat_template'] + config_path = os.path.join(triton_model_path, 'config.yaml') + with open(config_path, 'r') as f: + import yaml + config = yaml.safe_load(f) + chat_template_name = config['model_config']['chat_template'] model_name = model_name if model_name else model_path return model_name, chat_template_name @@ -290,17 +289,15 @@ async def get_generator(self, stop: bool, session_id: int): self.running_session_ids.add(session_id) return generator - def batch_infer( - self, - prompts: Union[List[str], str, List[Dict], List[List[Dict]]], - gen_config: Optional[Union[GenerationConfig, - List[GenerationConfig], - EngineGenerationConfig, - List[EngineGenerationConfig]]] = None, - do_preprocess: bool = True, - adapter_name: Optional[str] = None, - use_tqdm: bool = False, - **kwargs): + def batch_infer(self, + prompts: Union[List[str], str, List[Dict], + List[List[Dict]]], + gen_config: Optional[Union[GenerationConfig, + List[GenerationConfig]]] = None, + do_preprocess: bool = True, + adapter_name: Optional[str] = None, + use_tqdm: bool = False, + **kwargs): """Inference a batch of prompts. 
Args: @@ -321,13 +318,10 @@ def batch_infer( assert isinstance(prompts, List), 'prompts should be a list' if gen_config is None: gen_config = GenerationConfig() - # set random if it is not set - if not isinstance(gen_config, List) and gen_config.random_seed is None: - gen_config.random_seed = random.getrandbits(64) if not isinstance(gen_config, List): gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config),\ - 'input gen_confg length differs from the length of prompts' # noqa + assert len(prompts) == len(gen_config), \ + 'input gen_confg length differs from the length of prompts' # noqa prompt_num = len(prompts) session_ids = [next(self._session_id) for _ in range(prompt_num)] outputs = [ @@ -377,9 +371,7 @@ def stream_infer( self, prompts: Union[List[str], str, List[Dict], List[List[Dict]]], gen_config: Optional[Union[GenerationConfig, - List[GenerationConfig], - EngineGenerationConfig, - List[EngineGenerationConfig]]] = None, + List[GenerationConfig]]] = None, do_preprocess: bool = True, adapter_name: Optional[str] = None, **kwargs): @@ -402,13 +394,10 @@ def stream_infer( assert isinstance(prompts, List), 'prompts should be a list' if gen_config is None: gen_config = GenerationConfig() - # set random if it is not set - if not isinstance(gen_config, List) and gen_config.random_seed is None: - gen_config.random_seed = random.getrandbits(64) if not isinstance(gen_config, List): gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config),\ - 'input gen_confg length differs from the length of prompts' # noqa + assert len(prompts) == len(gen_config), \ + 'input gen_confg length differs from the length of prompts' # noqa session_ids = [next(self._session_id) for _ in range(len(prompts))] outputs = Queue() generators = [] @@ -478,8 +467,7 @@ async def generate( self, messages, session_id: int, - gen_config: Optional[Union[GenerationConfig, - EngineGenerationConfig]] = None, + gen_config: Optional[GenerationConfig] = None, tools: Optional[List[object]] = None, stream_response: bool = True, sequence_start: bool = True, @@ -508,11 +496,17 @@ async def generate( self.id2step[str(session_id)] = step if gen_config is None: gen_config = GenerationConfig() - if type(gen_config) is GenerationConfig: - gen_config = EngineGenerationConfig.From(gen_config, - self.tokenizer) - if gen_config.stop_words is None: - gen_config.stop_words = self.stop_words + else: + gen_config = deepcopy(gen_config) + gen_config.convert_stop_bad_words_to_ids(self.tokenizer) + if gen_config.stop_token_ids is None: + gen_config.stop_token_ids = self.stop_words + if not gen_config.do_sample: + # greedy decode + gen_config.top_k = 1 + # avoid unnecessary process + gen_config.temperature = 1.0 + gen_config.repetition_penalty = 1.0 # set random if it is not set and sequence_start is True if gen_config.random_seed is None and sequence_start: gen_config.random_seed = random.getrandbits(64) @@ -641,8 +635,7 @@ def parse_tool_response(self, text, tools, **kwargs): def chat(self, prompt: str, session=None, - gen_config: Optional[Union[GenerationConfig, - EngineGenerationConfig]] = None, + gen_config: Optional[GenerationConfig] = None, do_preprocess: bool = True, **kwargs) -> Session: """Chat. 
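Tying the serve-side changes together, a hedged usage sketch of `batch_infer` after the signature cleanup. The model path is an assumption; a single `GenerationConfig` is broadcast to all prompts, and a list must match the prompt count per the assertion above.

from lmdeploy import pipeline
from lmdeploy.messages import GenerationConfig

pipe = pipeline('internlm/internlm2-chat-7b')   # hypothetical model

prompts = ['Hi, please introduce yourself.', 'Shanghai is']
gen_cfgs = [
    GenerationConfig(do_sample=True, top_p=0.8, temperature=0.7,
                     max_new_tokens=256),
    # do_sample=False: generate() now forces greedy decoding
    # (top_k=1, temperature=1.0, repetition_penalty=1.0)
    GenerationConfig(do_sample=False, max_new_tokens=64),
]
responses = pipe.batch_infer(prompts, gen_config=gen_cfgs)
for resp in responses:
    print(resp.text)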
diff --git a/lmdeploy/serve/gradio/vl.py b/lmdeploy/serve/gradio/vl.py index ebeb371492..3413d62405 100644 --- a/lmdeploy/serve/gradio/vl.py +++ b/lmdeploy/serve/gradio/vl.py @@ -8,7 +8,7 @@ from packaging.version import Version, parse from PIL import Image -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) from lmdeploy.model import ChatTemplateConfig from lmdeploy.pytorch.engine.request import _run_until_complete @@ -128,11 +128,11 @@ def chat(chatbot, session, max_new_tokens, top_p, top_k, temperature): ' Please restart the session by reset button.') yield chatbot, session, enable_btn, disable_btn, enable_btn else: - gen_config = EngineGenerationConfig(max_new_tokens=max_new_tokens, - top_p=top_p, - top_k=top_k, - temperature=temperature, - stop_words=engine.stop_words) + gen_config = GenerationConfig(max_new_tokens=max_new_tokens, + top_p=top_p, + top_k=top_k, + temperature=temperature, + stop_token_ids=engine.stop_words) step = session.step state = DetokenizeState(len(input_ids)) for outputs in generator.stream_infer( diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index c434faf86f..34a134973b 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -298,6 +298,11 @@ async def chat_completions_v1(request: ChatCompletionRequest, 1.0 means no penalty - stop (str | List[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. + - response_format (Dict | None): Only pytorch backend support formatting + response. Examples: `{"type": "json_schema", "json_schema": {"name": + "test","schema": {"properties": {"name": {"type": "string"}}, + "required": ["name"], "type": "object"}}}` + or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}` - logit_bias (Dict): Bias to logits. Only supported in pytorch engine. - tools (List): A list of tools the model may call. Currently, only internlm2 functions are supported as a tool. 
Use this to specify a @@ -345,6 +350,13 @@ async def chat_completions_v1(request: ChatCompletionRequest, gen_logprobs, logits_processors = None, None if request.logprobs and request.top_logprobs: gen_logprobs = request.top_logprobs + response_format = None + if request.response_format and request.response_format.type != 'text': + if VariableInterface.async_engine.backend != 'pytorch': + return create_error_response( + HTTPStatus.BAD_REQUEST, + 'only pytorch backend can use response_format now') + response_format = request.response_format.model_dump() if request.logit_bias is not None: try: @@ -360,6 +372,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, gen_config = GenerationConfig( max_new_tokens=request.max_tokens, + do_sample=True, logprobs=gen_logprobs, top_k=request.top_k, top_p=request.top_p, @@ -368,6 +381,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, ignore_eos=request.ignore_eos, stop_words=request.stop, skip_special_tokens=request.skip_special_tokens, + response_format=response_format, logits_processors=logits_processors, random_seed=random_seed) @@ -590,6 +604,7 @@ async def completions_v1(request: CompletionRequest, gen_config = GenerationConfig( max_new_tokens=request.max_tokens if request.max_tokens else 512, + do_sample=True, logprobs=request.logprobs, top_k=request.top_k, top_p=request.top_p, @@ -675,7 +690,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: # Non-streaming response usage = UsageInfo() - choices = [] + choices = [None] * len(generators) async def _inner_call(i, generator): final_logprobs = [] @@ -704,12 +719,12 @@ async def _inner_call(i, generator): assert final_res is not None choice_data = CompletionResponseChoice( - index=0, + index=i, text=text, finish_reason=final_res.finish_reason, logprobs=logprobs, ) - choices.append(choice_data) + choices[i] = choice_data total_tokens = sum([ final_res.history_token_len, final_res.input_token_len, @@ -841,6 +856,7 @@ async def chat_interactive_v1(request: GenerateRequest, gen_config = GenerationConfig( max_new_tokens=request.request_output_len, + do_sample=True, top_p=request.top_p, top_k=request.top_k, temperature=request.temperature, @@ -963,7 +979,7 @@ def serve(model_path: str, api_keys (List[str] | str | None): Optional list of API keys. Accepts string type as a single api_key. Default to None, which means no api key applied. ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'. 
- """ # noqa E501 + """ # noqa E501 if os.getenv('TM_LOG_LEVEL') is None: os.environ['TM_LOG_LEVEL'] = log_level logger.setLevel(log_level) diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 48c46cae25..bd54028c39 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -87,6 +87,25 @@ class StreamOptions(BaseModel): include_usage: Optional[bool] = False +class JsonSchema(BaseModel): + name: str + # description is not used since it depends on model + description: Optional[str] = None + # use alias since pydantic does not support the OpenAI key `schema` + json_schema: Optional[Dict[str, Any]] = Field(default=None, + alias='schema', + examples=[None]) + # strict is not used + strict: Optional[bool] = False + + +class ResponseFormat(BaseModel): + # regex_schema is extended by lmdeploy to support regex output + type: Literal['text', 'json_object', 'json_schema', 'regex_schema'] + json_schema: Optional[JsonSchema] = None + regex_schema: Optional[str] = None + + class ChatCompletionRequest(BaseModel): """Chat completion request.""" model: str @@ -99,7 +118,7 @@ class ChatCompletionRequest(BaseModel): logprobs: Optional[bool] = False top_logprobs: Optional[int] = None n: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = None + logit_bias: Optional[Dict[str, float]] = Field(default=None, examples=[None]) # noqa max_tokens: Optional[int] = Field(default=None, examples=[None]) stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None]) # noqa # yapf: enable @@ -109,6 +128,8 @@ class ChatCompletionRequest(BaseModel): presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None + response_format: Optional[ResponseFormat] = Field(default=None, + examples=[None]) # noqa # additional argument of lmdeploy repetition_penalty: Optional[float] = 1.0 session_id: Optional[int] = -1 diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index f8e707e5c6..e9d9115de0 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -3,6 +3,7 @@ import numpy as np +from lmdeploy.pytorch.check_env import try_import_deeplink from lmdeploy.serve.async_engine import AsyncEngine from lmdeploy.utils import get_logger from lmdeploy.vl.constants import IMAGE_DUMMY_TOKEN_INDEX, IMAGE_TOKEN @@ -18,6 +19,8 @@ class VLAsyncEngine(AsyncEngine): def __init__(self, model_path: str, **kwargs) -> None: vision_config = kwargs.pop('vision_config', None) backend_config = kwargs.get('backend_config', None) + if kwargs.get('backend', '') == 'pytorch': + try_import_deeplink(backend_config.device_type) self.vl_encoder = ImageEncoder(model_path, vision_config, backend_config=backend_config) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index ba488b77a4..ade7875ce1 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -3,7 +3,7 @@ import random from lmdeploy.archs import get_model_arch -from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig +from lmdeploy.messages import GenerationConfig, TurbomindEngineConfig from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.async_engine import get_names_from_model from lmdeploy.tokenizer import DetokenizeState @@ -70,7 +70,7 @@ def main(model_path: str, request_output_len (int): output token nums chat_template_config (ChatTemplateConfig): chat template config kwargs (dict): unused args - """ # noqa: E 501 + """ # noqa: E 501 # 
chat template _, chat_template_name = get_names_from_model(model_path) @@ -110,12 +110,12 @@ def main(model_path: str, if stop_words is not None: stop_words = stop_words[0][0].tolist() - gen_config = EngineGenerationConfig(max_new_tokens=request_output_len, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - stop_words=stop_words) + gen_config = GenerationConfig(max_new_tokens=request_output_len, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + stop_token_ids=stop_words) nth_round = 1 step = 0 diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py new file mode 100644 index 0000000000..bec6120b7b --- /dev/null +++ b/lmdeploy/turbomind/deploy/config.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import json +from dataclasses import asdict, fields + +# use pydantic.dataclasses.dataclass to check data type +from pydantic.dataclasses import dataclass + +from lmdeploy.messages import TurbomindEngineConfig + + +def config_from_dict(cls, env): + """initiate an instance of a config class from a dict.""" + params = inspect.signature(cls).parameters + used = {k: v for k, v in env.items() if k in params and v is not None} + return cls(**used) + + +def config_to_dict(config): + """export config to a dict.""" + if not config: + return dict() + assert isinstance(config, (ModelConfig, AttentionConfig, LoraConfig)), \ + f'A dataclass is expected, but got {type(config)}' + + return asdict(config) + + +@dataclass +class ModelConfig: + model_name: str = '' + chat_template: str = '' + model_arch: str = None + head_num: int = None + kv_head_num: int = None + hidden_units: int = None + vocab_size: int = None + num_layer: int = None + inter_size: int = None + norm_eps: float = None + attn_bias: int = None + start_id: int = None + end_id: int = None + size_per_head: int = 128 + group_size: int = 0 + weight_type: str = None + session_len: int = None + tp: int = 1 + model_format: str = 'hf' + + +@dataclass +class AttentionConfig: + rotary_embedding: int = 128 + rope_theta: float = 10000.0 + max_position_embeddings: int = 0 + original_max_position_embeddings: int = 0 + rope_scaling_type: str = '' + rope_scaling_factor: float = 0.0 + use_dynamic_ntk: int = 0 + low_freq_factor: float = 1.0 + high_freq_factor: float = 1.0 + use_logn_attn: int = 0 + cache_block_seq_len: int = 64 + + +@dataclass +class LoraConfig: + lora_policy: str = '' + lora_r: int = 0 + lora_scale: float = 0.0 + lora_max_wo_r: int = 0 + lora_rank_pattern: str = '' + lora_scale_pattern: str = '' + + +@dataclass +class TurbomindModelConfig: + """Config for turbomind model.""" + model_config: ModelConfig = None + attention_config: AttentionConfig = None + lora_config: LoraConfig = None + + def update_from_engine_config(self, config: TurbomindEngineConfig): + """Update the attributes of this instance with the attributes from + TurbomindEngineConfig. 
+ + Args: + config (TurbomindEngineConfig): The turbomind engine config + """ + if config is None: + return + for key, value in asdict(config).items(): + if not value: + continue + + if hasattr(self.model_config, key): + setattr(self.model_config, key, value) + if hasattr(self.attention_config, key): + setattr(self.attention_config, key, value) + + @classmethod + def from_dict(cls, config: dict = {}): + """construct TurbomindModelConfig instance from config in a dict.""" + _cfg = { + field.name: config.get(field.name, {}) + for field in fields(TurbomindModelConfig) + } + + return TurbomindModelConfig( + model_config=config_from_dict(ModelConfig, _cfg['model_config']), + attention_config=config_from_dict(AttentionConfig, + _cfg['attention_config']), + lora_config=config_from_dict(LoraConfig, _cfg['lora_config'])) + + def to_dict(self): + """export to a dict.""" + return dict(model_config=config_to_dict(self.model_config), + attention_config=config_to_dict(self.attention_config), + lora_config=config_to_dict(self.lora_config)) + + @property + def session_len(self): + return self.model_config.session_len + + @property + def tensor_para_size(self): + return self.model_config.tp + + @property + def weight_type(self): + return self.model_config.weight_type + + @property + def group_size(self): + return self.model_config.group_size + + def __str__(self): + return json.dumps(self.to_dict(), indent=2) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 441b3cbe22..bce9bbd614 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -13,10 +13,11 @@ from ...utils import _get_and_verify_max_len from ..supported_models import SUPPORTED_ARCHS, is_supported +from .config import TurbomindModelConfig from .exporter import get_exporter_factory from .policy import get_input_policy from .source_model.base import INPUT_MODELS -from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig +from .target_model.base import OUTPUT_MODELS SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', 'gptq', None] logger = get_logger('lmdeploy') @@ -93,14 +94,14 @@ def get_output_model_registered_name_and_config(model_path: str, Args: model_path (str): the path of the input model model_format (str): the format of the model, which can be one of - ['meta_llama', 'hf', 'awq'] + ['meta_llama', 'hf', 'awq', 'gptq'] group_size (int): the size of group used by awq model """ register_name = 'tm' turbomind_model_arch = 'llama' weight_type = 'fp16' - config = TurbomindModelConfig.from_dict({}, allow_none=True) + config = TurbomindModelConfig.from_dict() if model_format == 'meta_llama': session_len = 2048 @@ -124,10 +125,11 @@ def get_output_model_registered_name_and_config(model_path: str, 'Device does not support bfloat16. 
Set float16 forcefully') weight_type = 'fp16' - config.model_arch = model_arch - config.session_len = session_len + 8 - config.weight_type = weight_type - config.group_size = group_size + config.model_config.model_arch = model_arch + config.model_config.weight_type = weight_type + config.model_config.model_format = model_format + config.model_config.group_size = group_size + config.model_config.session_len = session_len lora_type = 'plora' if turbomind_model_arch == 'xcomposer2' else '' @@ -181,7 +183,7 @@ def find_quantization_config(nested, target_key): def get_tm_model(model_path, model_name, chat_template_name, - engine_config, + engine_config: TurbomindEngineConfig, group_size: int = None, out_dir: str = None): """Create turbomind model. @@ -215,9 +217,6 @@ def get_tm_model(model_path, f'mismatched quant group size: user input "{group_size}" ' \ f'vs model quant_config "{_group_size}"' - engine_config.model_format = quant_method - group_size = _group_size - if quant_method == 'awq': assert version == 'gemm', \ f'unsupported quant config: {quant_config}' @@ -228,6 +227,9 @@ def get_tm_model(model_path, else: assert 0, f'unsupported quant_config: {quant_config}' + engine_config.model_format = quant_method + group_size = _group_size + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) if not group_size: group_size = 128 @@ -245,38 +247,28 @@ def get_tm_model(model_path, tokenizer_path=model_path, input_policy=input_policy) - output_model_name, cfg, exporter_factory = \ + output_model_name, tm_cfg, exporter_factory = \ get_output_model_registered_name_and_config( model_path=model_path, model_format=engine_config.model_format, group_size=group_size) - cfg.chat_template = chat_template_name - cfg.model_name = model_name - cfg.tensor_para_size = engine_config.tp + tm_cfg.model_config.chat_template = chat_template_name + tm_cfg.model_config.model_name = model_name + tm_cfg.model_config.tp = engine_config.tp output_model = OUTPUT_MODELS.get(output_model_name)( input_model=input_model, - cfg=cfg, + cfg=tm_cfg, exporter_factory=exporter_factory, out_dir=out_dir) - if engine_config.rope_scaling_factor == 0: - # to avoid `rope_scaling_factor` from engine_config override - # the rope_scaling_factor in TurbomindModelConfig - engine_config.rope_scaling_factor = None - output_model.cfg.update_from_engine_config(engine_config) - # cast bool to int, otherwise, the bool variables will be saved to - # config.ini as string - # TODO(lvhan): change config.ini to config.yaml - output_model.cfg.enable_prefix_caching = int( - output_model.cfg.enable_prefix_caching) - output_model.cfg.use_logn_attn = int(output_model.cfg.use_logn_attn) + return output_model def main(model_name: str, model_path: str, - model_format: str = None, + model_format: str = 'hf', chat_template: str = None, tokenizer_path: str = None, dst_path: str = 'workspace', @@ -291,10 +283,10 @@ def main(model_name: str, model_name (str): unused any longer model_path (str): the directory path of the model model_format (str): the format of the model, should choose from - ['meta_llama', 'hf', 'awq', None]. 'meta_llama' stands for META's - llama format, 'hf' means huggingface llama format, and 'awq' means - llama(hf) model quantized by lmdeploy/lite/quantization/awq.py. - The default value is None + ['meta_llama', 'hf', 'awq', 'gptq']. 'meta_llama' stands for META's + llama format, 'hf' means huggingface model, and 'awq', `gptq` + means models quantized by `autoawq` and `autogptq` respectively. 
+ The default value is hf chat_template (str): the name of the built-in chat template. tokenizer_path (str): the path of tokenizer model dst_path (str): the destination path that saves outputs diff --git a/lmdeploy/turbomind/deploy/exporter.py b/lmdeploy/turbomind/deploy/exporter.py index 48f9312fa6..9667d34583 100644 --- a/lmdeploy/turbomind/deploy/exporter.py +++ b/lmdeploy/turbomind/deploy/exporter.py @@ -74,9 +74,9 @@ class BaseExporter(ABC): def __init__(self, model: BaseOutputModel): self.model = model - self.tp = model.cfg.tensor_para_size - self.head_dim = model.cfg.size_per_head - self.inter_size = model.cfg.inter_size + self.tp = model.tensor_para_size + self.head_dim = model.model_config.size_per_head + self.inter_size = model.model_config.inter_size def export_attn(self, idx: int, qkvo, kind: str, pack_fn=identity): if all(x is None for x in qkvo): @@ -156,7 +156,7 @@ class QuantWeightExporter(BaseExporter): def __init__(self, model: BaseOutputModel, pack_fn): super().__init__(model) self.pack_fn = pack_fn - self.group_size = model.cfg.group_size + self.group_size = model.tm_config.group_size def export(self, r: BaseReader, i: int): diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 87983b2551..6b839876fe 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,20 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. -import configparser -import copy -import inspect -import io -import json import os.path as osp from abc import ABC, abstractmethod -from configparser import ConfigParser import torch import tqdm +import yaml from mmengine import Registry -from pydantic.dataclasses import dataclass - -from lmdeploy.messages import TurbomindEngineConfig +from ..config import (AttentionConfig, LoraConfig, ModelConfig, + TurbomindModelConfig, config_from_dict, config_to_dict) from ..source_model.base import BaseInputModel, BaseReader OUTPUT_MODELS = Registry( @@ -31,122 +25,8 @@ def tprint(*args, **kwargs): tqdm.tqdm.write(s.getvalue()) -@dataclass -class TurbomindModelConfig: - """Config for turbomind model.""" - - model_name: str = '' - chat_template: str = '' - model_arch: str = None - tensor_para_size: int = None - head_num: int = None - kv_head_num: int = None - hidden_units: int = None - vocab_size: int = None - num_layer: int = None - inter_size: int = None - norm_eps: float = None - attn_bias: int = None - start_id: int = None - end_id: int = None - session_len: int = None - weight_type: str = None - rotary_embedding: int = 128 - rope_theta: float = 10000.0 - size_per_head: int = 128 - group_size: int = 0 - max_batch_size: int = 64 - max_prefill_token_num: int = 8192 - max_context_token_num: int = 1 - step_length: int = 1 - cache_max_entry_count: float = 0.8 - cache_block_seq_len: int = 64 - cache_chunk_size: int = -1 - enable_prefix_caching: bool = False - num_tokens_per_iter: int = 0 - max_prefill_iters: int = 1 - use_context_fmha: int = 1 - quant_policy: int = 0 - max_position_embeddings: int = 0 - original_max_position_embeddings: int = 0 - rope_scaling_type: str = '' - rope_scaling_factor: float = 0.0 - use_dynamic_ntk: int = 0 - low_freq_factor: float = 1.0 - high_freq_factor: float = 1.0 - use_logn_attn: int = 0 - lora_policy: str = '' - lora_r: int = 0 - lora_scale: float = 0.0 - lora_max_wo_r: int = 0 - lora_rank_pattern: str = '' - lora_scale_pattern: str = '' - - @classmethod - def from_dict(cls, env, allow_none=False): - """Construct from 
dict.""" - params = inspect.signature(cls).parameters - used = {k: v for k, v in env.items() if k in params and v is not None} - if not allow_none: - return cls(**used) - else: - default = { - k: None - for k in params.keys() if params[k].default is inspect._empty - } - default.update(used) - return cls(**default) - - def update_from_engine_config(self, config: TurbomindEngineConfig): - """Update the attributes of this instance with the attributes from - TurbomindEngineConfig. - - Args: - config (TurbomindEngineConfig): The turbomind engine config - """ - if config is None: - return - # Iterate over the fields of 'self' - for field_name, _ in self.__dataclass_fields__.items(): - # If the field value in 'other' is not None, - # update the corresponding field in 'self' - if hasattr(config, field_name) and getattr(config, - field_name) is not None: - setattr(self, field_name, getattr(config, field_name)) - - self.tensor_para_size = config.tp - assert self.session_len is not None - if config.max_prefill_token_num is not None and \ - config.num_tokens_per_iter == 0: - self.num_tokens_per_iter = config.max_prefill_token_num - self.max_prefill_iters = (self.session_len + - config.max_prefill_token_num - - 1) // config.max_prefill_token_num - - def toini(self): - config = copy.deepcopy(self.__dict__) - parser = ConfigParser() - parser['llama'] = config - with io.StringIO() as ss: - parser.write(ss) - ss.seek(0) - ini = ss.read() - return ini - - def __str__(self): - return json.dumps(self.__dict__, indent=2) - - @property - def valid(self): - """Check if cfg is valid.""" - for _, v in self.__dict__.items(): - if v is None: - return False - return True - - def _weight_dtype_map(weight_type: str, default=None): - """get weight dtype map.""" + """map literal data type to torch dtype.""" _WEIGHT_DTYPE_MAP = dict( int4=torch.float16, @@ -169,47 +49,65 @@ def __init__(self, out_dir: str = ''): super().__init__() self.input_model = input_model - self.cfg = cfg - if not cfg.valid: - self.cfg = self.get_config(cfg) - assert self.cfg.valid - assert self.cfg.kv_head_num % self.cfg.tensor_para_size == 0 + self.model_config = cfg.model_config + self.attention_config = cfg.attention_config + self.lora_config = cfg.lora_config + self.tensor_para_size = self.model_config.tp self.out_dir = out_dir self.to_file = True if out_dir else False self.tm_params = {} - model_info = self.input_model.model_info() - self.permute_qk = model_info.get('permute_qk', True) + + # get `model_info` and `tokenizer_info` at first, which + # will be updated to `self.model_config` and `self.attention_config` + self.input_model_info = self.input_model.model_info() + self.input_model_tokenizer_info = self.input_model.tokenizer_info() + self.permute_qk = self.input_model_info.get('permute_qk', True) + + self.update_model_config() + assert self.model_config.kv_head_num % self.tensor_para_size == 0 + + self.update_attention_config() + self.update_lora_config() # ! 
Dependency on `self` self.exporters = exporter_factory(self) @abstractmethod - def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig: - """Generate turbomind model config (config.ini).""" - _, bos_id, eos_id = self.input_model.tokenizer_info() + def update_model_config(self): + """Update `self.model_config` according to the input_model's + `tokenizer_info` and `model_info`""" + _, bos_id, eos_id = self.input_model_tokenizer_info - final_cfg = cfg.__dict__ + final_cfg = config_to_dict(self.model_config) final_cfg.update(dict(start_id=bos_id, end_id=eos_id)) - final_cfg.update(self.input_model.model_info()) + final_cfg.update(self.input_model_info) - # vocab_size + # get vocab_size for bin in self.input_model.bins(): emb = bin.tok_embeddings() if emb is not None: - _vocab_size, dim = emb.shape + _vocab_size, _ = emb.shape break final_cfg.update(dict(vocab_size=_vocab_size)) - return TurbomindModelConfig.from_dict(final_cfg, allow_none=True) + self.model_config = config_from_dict(ModelConfig, final_cfg) + + def update_attention_config(self): + """update attention config according to input model's model info.""" + final_cfg = config_to_dict(self.attention_config) + final_cfg.update(self.input_model_info) + self.attention_config = config_from_dict(AttentionConfig, final_cfg) + + def update_lora_config(self): + """update lora config according to input model's model info.""" + final_cfg = config_to_dict(self.lora_config) + final_cfg.update(self.input_model_info) + self.lora_config = config_from_dict(LoraConfig, final_cfg) def export_config(self) -> None: """export turbomind config.""" if self.to_file: - config = configparser.ConfigParser() - cfg = dict(llama=self.cfg.__dict__) - for section, key_values in cfg.items(): - config[section] = key_values - config_path = osp.join(self.out_dir, 'config.ini') + config_path = osp.join(self.out_dir, 'config.yaml') with open(config_path, 'w') as f: - config.write(f) + yaml.safe_dump(self.tm_config.to_dict(), f) def export_weight(self, param: torch.Tensor, name: str) -> None: """export turbomind weight.""" @@ -222,14 +120,14 @@ def _tofile(tensor, path): if self.to_file: if torch.is_floating_point(param): - torch_type = _weight_dtype_map(self.cfg.weight_type, + torch_type = _weight_dtype_map(self.model_config.weight_type, torch.float16) param = param.to(torch_type) tprint(name, param.shape) _tofile(param, osp.join(self.out_dir, name)) elif len(self.tm_params) > 0: tm_params = self.tm_params - weight_type = self.cfg.weight_type + weight_type = self.model_config.weight_type assert weight_type in ['fp16', 'fp32', 'bf16', 'int4'] # currently, the tensor type should in @@ -269,7 +167,7 @@ def save_split(self, split_dim = None copy = True - tp = self.cfg.tensor_para_size + tp = self.tensor_para_size if split_dim is not None: tprint( f'*** splitting {name}, shape={tensor.shape}, ' @@ -295,7 +193,7 @@ def save_split(self, def export(self) -> None: """Export to turbomind model format.""" - num_layer = self.cfg.num_layer + num_layer = self.model_config.num_layer from tqdm import tqdm pbar = tqdm(total=num_layer, desc='Convert to turbomind format', @@ -321,8 +219,8 @@ def export_misc(self, bin: BaseReader) -> None: def pad_weight(tensor): pad_size = None - vocab_size = self.cfg.vocab_size - tp = self.cfg.tensor_para_size + vocab_size = self.model_config.vocab_size + tp = self.tensor_para_size if vocab_size % tp != 0: pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size @@ -344,3 +242,9 @@ def export_transformer_block(self, bin: BaseReader, i: int) 
-> None: """Export transformer block.""" for e in self.exporters: e.export(bin, i) + + @property + def tm_config(self): + return TurbomindModelConfig(model_config=self.model_config, + attention_config=self.attention_config, + lora_config=self.lora_config) diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py index 57c958fd36..14e1115b20 100644 --- a/lmdeploy/turbomind/deploy/target_model/fp.py +++ b/lmdeploy/turbomind/deploy/target_model/fp.py @@ -1,16 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .base import OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig +from ..config import ModelConfig, config_from_dict, config_to_dict +from .base import OUTPUT_MODELS, BaseOutputModel @OUTPUT_MODELS.register_module(name='tm') class TurbomindModel(BaseOutputModel): """Export to turbomind fp16 format.""" - def get_config(self, cfg: TurbomindModelConfig): - """Get turbomind config.""" - final_cfg = super().get_config(cfg).__dict__ - # attn_bias, inter_size + def update_model_config(self): + """Update `self.model_config`. + + First, call `update_model_config` of the superclass. Then update + `inter_size` and `attn_bias`, which are indicated by the input_model's + weight files. + """ + super().update_model_config() + final_cfg = config_to_dict(self.model_config) + # get attn_bias, inter_size visit = False attn_bias = 0 for bin in self.input_model.bins(): @@ -24,13 +31,13 @@ def get_config(self, cfg: TurbomindModelConfig): break if visit: break - inter_size = self._pad_inter_size(inter_size, final_cfg) + inter_size = self._pad_inter_size(inter_size) final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) - return TurbomindModelConfig.from_dict(final_cfg) + self.model_config = config_from_dict(ModelConfig, final_cfg) - def _pad_inter_size(self, inter_size: int, cfg: dict): - group_size = max(1, cfg['group_size']) - tp = cfg['tensor_para_size'] + def _pad_inter_size(self, inter_size: int): + group_size = max(1, self.model_config.group_size) + tp = self.tensor_para_size groups_per_rank = (inter_size // group_size + tp - 1) // tp inter_size_padded = groups_per_rank * group_size * tp return inter_size_padded diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 75e70e669a..91b057d723 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -15,22 +15,19 @@ def get_llama_gemm(): return bin_path -def read_config(ini_path: str): +def read_config(config_file: str): """Read turbomind config from the turbomind model. Args: - ini_path (str): the path of `config.ini` file in turbomind model + config_file (str): the path of the config file in the turbomind model """ - from configparser import ConfigParser + + import yaml from lmdeploy.turbomind.deploy.target_model.base import \ TurbomindModelConfig - - with open(ini_path, 'r') as f: - parser = ConfigParser() - parser.read_file(f) - section_name = 'llama' - _cfg = parser._sections[section_name] + with open(config_file, 'r') as f: + _cfg = yaml.safe_load(f) cfg = TurbomindModelConfig.from_dict(_cfg) return cfg.head_num, cfg.size_per_head, cfg.inter_size, \ cfg.vocab_size, cfg.tensor_para_size @@ -52,7 +49,7 @@ def main(head_num: int = 32, head_num, size_per_head, inter_size, vocab_size, \ tensor_para_size = read_config( osp.join(model_path, - 'triton_models', 'weights', 'config.ini')) + 'triton_models', 'weights', 'config.yaml')) else: from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path,
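With `read_config` above now parsing YAML, the converted weights directory carries a nested `config.yaml` instead of the flat `config.ini`. The sketch below illustrates the kind of structure this patch implies (a `model_config` / `attention_config` / `lora_config` split, with an `engine_config` block merged in later at runtime); the key names are taken from this diff, but the concrete values are placeholders, not an authoritative schema.

```python
# Illustrative only: key layout inferred from this diff; values are placeholders.
import yaml

config = {
    'model_config': {
        'model_name': 'internlm2',
        'head_num': 32,
        'kv_head_num': 8,
        'size_per_head': 128,
        'inter_size': 14336,
        'vocab_size': 92544,
        'num_layer': 32,
        'weight_type': 'fp16',
        'group_size': 0,
        'session_len': 32768,
    },
    'attention_config': {
        'rotary_embedding': 128,
        'rope_theta': 10000.0,
        'cache_block_seq_len': 64,
    },
    'lora_config': {
        'lora_policy': '',
        'lora_r': 0,
    },
}

text = yaml.safe_dump(config)    # roughly what export_config() writes to config.yaml
restored = yaml.safe_load(text)  # roughly what read_config() loads back
assert restored['model_config']['head_num'] == 32
```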
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index e1ab172bf1..3b05e5717c 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -1,25 +1,26 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio +import json import os.path as osp import sys from concurrent.futures import ThreadPoolExecutor -from configparser import ConfigParser +from dataclasses import asdict from itertools import repeat from queue import LifoQueue, Queue from typing import Dict, Iterable, List, Union import numpy as np import torch +import yaml from torch.nn.utils.rnn import pad_sequence import lmdeploy -from lmdeploy.messages import (EngineGenerationConfig, EngineOutput, - ResponseType, TurbomindEngineConfig) +from lmdeploy.messages import (EngineOutput, GenerationConfig, ResponseType, + TurbomindEngineConfig) from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger, get_model -from .deploy.converter import SUPPORTED_FORMATS, get_tm_model -from .deploy.target_model.base import TurbomindModelConfig +from .deploy.config import TurbomindModelConfig from .supported_models import is_supported from .utils import ModelSource, get_model_source @@ -164,33 +165,53 @@ def _get_params(device_id, que): tm_params[k] = [] tm_params[k].append(v) + def _postprocess_config(self, tm_config, engine_config): + """Postprocess the turbomind config with the engine config.""" + import copy + self.config = copy.deepcopy(tm_config) + # Update the attribute values in `self.config` with the valid values + # from the corresponding attributes in `engine_config`, such as + # `session_len`, `quant_policy`, `rope_scaling_factor`, etc.
+ self.config.update_from_engine_config(engine_config) + + # update some attributes of `engine_config` which depends on + # `session_len` + self.engine_config = engine_config + if engine_config.max_prefill_token_num is not None \ + and engine_config.num_tokens_per_iter == 0: + self.engine_config.num_tokens_per_iter = \ + engine_config.max_prefill_token_num + self.engine_config.max_prefill_iters = ( + self.config.session_len + engine_config.max_prefill_token_num - + 1) // engine_config.max_prefill_token_num + + # pack `self.config` and `self.engine_config` into a dict + self.config_dict = self.config.to_dict() + self.config_dict.update(dict(engine_config=asdict(self.engine_config))) + logger.info(f'turbomind model config:\n\n' + f'{json.dumps(self.config_dict, indent=2)}') + def _from_hf(self, model_source: ModelSource, model_path: str, engine_config: TurbomindEngineConfig): """Load model which is in hf format.""" assert model_source == ModelSource.HF_MODEL, \ f'{model_source} is not supported' - if engine_config is None: - logger.warning('input engine config is None, using the default') - engine_config = TurbomindEngineConfig() - assert engine_config.model_format in SUPPORTED_FORMATS, \ - f'The model format should be in {SUPPORTED_FORMATS}' - assert is_supported(model_path), ( f'turbomind does not support {model_path}. ' 'Plz try pytorch engine instead.') - # convert transformers model into turbomind model format + # convert transformers model into turbomind model + from .deploy.converter import get_tm_model tm_model = get_tm_model(model_path, self.model_name, self.chat_template_name, engine_config) - self.config = tm_model.cfg - logger.info(f'model_config:\n\n{self.config.toini()}') + self._postprocess_config(tm_model.tm_config, engine_config) model_comm = _tm.AbstractTransformerModel.create_llama_model( model_dir='', - config=self.config.toini(), + config=yaml.safe_dump(self.config_dict), tensor_para_size=self.gpu_count, - data_type=self.config.weight_type) + data_type=self.config.model_config.weight_type) # create empty weight self._create_weight(model_comm) @@ -212,42 +233,27 @@ def _from_hf(self, model_source: ModelSource, model_path: str, def _from_workspace(self, model_path: str, engine_config: TurbomindEngineConfig): """Load model which is converted by `lmdeploy convert`""" - ini_path = osp.join(model_path, 'triton_models', 'weights', - 'config.ini') - # load cfg - with open(ini_path, 'r') as f: - parser = ConfigParser() - parser.read_file(f) - section_name = 'llama' - _cfg = parser._sections[section_name] + config_path = osp.join(model_path, 'triton_models', 'weights', + 'config.yaml') + # load TurbomindModelConfig from config file + with open(config_path, 'r') as f: + _cfg = yaml.safe_load(f) cfg = TurbomindModelConfig.from_dict(_cfg) # check whether input tp is valid + self.gpu_count = engine_config.tp if cfg.tensor_para_size != 1 and \ self.gpu_count != cfg.tensor_para_size: - logger.info(f'found tp={cfg.tensor_para_size} in config.ini.') + logger.info(f'found tp={cfg.tensor_para_size} in config.yaml.') self.gpu_count = cfg.tensor_para_size + engine_config.tp = self.gpu_count + + self._postprocess_config(cfg, engine_config) - if engine_config is not None: - engine_config.tp = cfg.tensor_para_size - if engine_config.rope_scaling_factor == 0: - # to avoid `rope_scaling_factor` from engine_config override - # the rope_scaling_factor in TurbomindModelConfig - engine_config.rope_scaling_factor = None - cfg.update_from_engine_config(engine_config) - if self.model_name: - 
cfg.model_name = self.model_name - if self.chat_template_name: - cfg.chat_template_name = self.chat_template_name - # update cfg - self.config = cfg - - # create model - logger.warning(f'model_config:\n\n{cfg.toini()}') weight_dir = osp.join(model_path, 'triton_models', 'weights') model_comm = _tm.AbstractTransformerModel.create_llama_model( model_dir=weight_dir, - config=cfg.toini(), + config=yaml.safe_dump(self.config_dict), tensor_para_size=self.gpu_count, data_type=self.config.weight_type) @@ -404,7 +410,7 @@ def end(self, session_id: int): input_ids, sequence_start=False, sequence_end=True, - gen_config=EngineGenerationConfig(max_new_tokens=0)): + gen_config=GenerationConfig(max_new_tokens=0)): pass async def async_end(self, session_id: int): @@ -421,7 +427,7 @@ def cancel(self, session_id: int): sequence_start=False, sequence_end=False, stop=True, - gen_config=EngineGenerationConfig(max_new_tokens=0)): + gen_config=GenerationConfig(max_new_tokens=0)): pass async def async_cancel(self, session_id: int): @@ -480,7 +486,7 @@ def prepare_embeddings(self, def prepare_inputs(self, session_id, input_ids, - gen_config: EngineGenerationConfig, + gen_config: GenerationConfig, input_embeddings=None, input_embedding_ranges=None, sequence_start: bool = True, @@ -551,13 +557,13 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): inputs['logprobs'] = _broadcast_np(gen_config.logprobs, np.int32) bad_words = [] - if gen_config.bad_words is not None: - bad_words.extend(gen_config.bad_words) + if gen_config.bad_token_ids is not None: + bad_words.extend(gen_config.bad_token_ids) if gen_config.ignore_eos: stop_words = None bad_words.append(self.eos_id) else: - stop_words = gen_config.stop_words + stop_words = gen_config.stop_token_ids stop_words = _construct_stop_or_bad_words(stop_words) bad_words = _construct_stop_or_bad_words(bad_words) @@ -580,7 +586,7 @@ async def async_stream_infer(self, sequence_end: bool = False, step=0, stop=False, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, stream_output=False, **kwargs): """Perform model inference. @@ -595,7 +601,7 @@ async def async_stream_infer(self, sequence_end (bool): indicator for ending a sequence step (int): the offset of the k/v cache stop (bool): indicator for cancelling the session - gen_config (EngineGenerationConfig): generation config + gen_config (GenerationConfig): generation config stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ @@ -663,8 +669,8 @@ async def async_stream_infer(self, outputs = EngineOutput(status, output[:-1].tolist(), len_ - 1) elif len(output) > 0 and \ - gen_config.stop_words is not None and \ - output[-1].item() in gen_config.stop_words: + gen_config.stop_token_ids is not None and \ + output[-1].item() in gen_config.stop_token_ids: outputs = EngineOutput(status, output[:-1].tolist(), len_) else: outputs = EngineOutput(status, output.tolist(), len_) @@ -697,7 +703,7 @@ def stream_infer(self, sequence_end: bool = False, step=0, stop=False, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, stream_output=False, **kwargs): """Perform model inference. 
@@ -712,7 +718,7 @@ def stream_infer(self, sequence_end (bool): indicator for ending a sequence step (int): the offset of the k/v cache stop (bool): indicator for cancelling the session - gen_config (EngineGenerationConfig): generation config + gen_config (GenerationConfig): generation config stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ @@ -776,8 +782,8 @@ def stream_infer(self, outputs = EngineOutput(status, output[:-1].tolist(), len_ - 1, out_logprobs) elif len(output) > 0 and \ - gen_config.stop_words is not None and \ - output[-1].item() in gen_config.stop_words: + gen_config.stop_token_ids is not None and \ + output[-1].item() in gen_config.stop_token_ids: outputs = EngineOutput(status, output[:-1].tolist(), len_, out_logprobs) else: diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index de1bf04efb..206dd6d08c 100644 --- a/lmdeploy/utils.py +++ b/lmdeploy/utils.py @@ -200,6 +200,7 @@ def get_model(pretrained_model_name_or_path: str, download_kwargs['token'] = token model_path = snapshot_download(pretrained_model_name_or_path, + ignore_patterns=['*.pth'], **download_kwargs) return model_path diff --git a/requirements/runtime.txt b/requirements/runtime.txt index c6a1e74444..e7a55891e3 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -5,6 +5,7 @@ fire mmengine-lite numpy<2.0.0 openai +outlines peft<=0.11.1 pillow protobuf diff --git a/requirements/test.txt b/requirements/test.txt index d06440a9d7..607907dffd 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -3,6 +3,7 @@ coverage pynvml pytest pytest-assume +pytest-cov pytest-order pytest-rerunfailures pytest-sugar diff --git a/src/turbomind/triton_backend/llama/CMakeLists.txt b/src/turbomind/triton_backend/llama/CMakeLists.txt index ac8c47d774..26c580714a 100644 --- a/src/turbomind/triton_backend/llama/CMakeLists.txt +++ b/src/turbomind/triton_backend/llama/CMakeLists.txt @@ -25,5 +25,5 @@ set(llama_triton_backend_files find_package(CUDAToolkit REQUIRED) add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files}) set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) -target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt) +target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt yaml-cpp::yaml-cpp) target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 58fec72e88..e2a564aa44 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -19,7 +19,6 @@ // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "3rdparty/INIReader.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaInstanceComm.h" #include "src/turbomind/models/llama/LlamaLinear.h" @@ -30,34 +29,39 @@ #include "src/turbomind/utils/cuda_utils.h" #include #include +#include namespace ft = turbomind; -std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string inifile) +std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string config_file) { - INIReader reader = INIReader(inifile); - if (reader.ParseError() < 
0) { - std::cout << "[ERROR] Can't load '" << inifile << "'\n"; - return nullptr; + YAML::Node reader; + try { + reader = YAML::Load(config_file); + } + catch (const YAML::Exception& e) { + std::cerr << "Error reading YAML config: " << e.what() << std::endl; + ft::FT_CHECK(false); } - const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); - int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); - std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir"); + const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; + const std::string data_type = ft_instance_hyperparameter["data_type"].as(); + int tensor_para_size = ft_instance_hyperparameter["tensor_para_size"].as(); + std::string model_dir = ft_instance_hyperparameter["model_dir"].as(); if (data_type == "half" || data_type == "fp16") { return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + ft_instance_hyperparameter["tensor_para_size"].as(), + ft_instance_hyperparameter["pipeline_para_size"].as(), + ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), model_dir); } else if (data_type == "bf16") { #ifdef ENABLE_BF16 return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + ft_instance_hyperparameter["tensor_para_size"].as(), + ft_instance_hyperparameter["pipeline_para_size"].as(), + ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); @@ -67,9 +71,9 @@ std::shared_ptr AbstractTransformerModel::createLlamaM else { #ifdef ENABLE_FP32 return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + ft_instance_hyperparameter["tensor_para_size"].as(), + ft_instance_hyperparameter["pipeline_para_size"].as(), + ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); @@ -189,81 +193,81 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, weights_(ft::getDeviceCount()), enable_custom_all_reduce_(enable_custom_all_reduce) { - INIReader reader; FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); - if (!model_dir.empty()) { - model_dir_ = model_dir; - const std::string inifile{model_dir + "/config.ini"}; - reader = INIReader(inifile); - if (reader.ParseError() < 0) { - TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str()); - ft::FT_CHECK(false); + YAML::Node reader; + + try { + if (!model_dir.empty()) { + model_dir_ = model_dir; + const std::string config_file{model_dir + "/config.yaml"}; + reader = YAML::LoadFile(config_file); } - } - if (!config.empty()) { - std::FILE* tmpf = std::tmpfile(); - std::fputs(config.c_str(), tmpf); - std::rewind(tmpf); - reader = INIReader(tmpf); - if (reader.ParseError() < 0) { - TM_LOG_ERROR("[ERROR] Can't init with config %s", config.c_str()); - ft::FT_CHECK(false); + if (!config.empty()) { + reader = 
YAML::Load(config); } } + catch (const YAML::Exception& e) { + std::cerr << "Error reading YAML config: " << e.what() << std::endl; + ft::FT_CHECK(false); + } - model_name_ = reader.Get("llama", "model_name"); - model_param_.head_num = reader.GetInteger("llama", "head_num"); - model_param_.head_dim = reader.GetInteger("llama", "size_per_head"); - model_param_.kv_head_num = reader.GetInteger("llama", "kv_head_num", 0); - model_param_.hidden_units = reader.GetInteger("llama", "hidden_units"); - model_param_.layer_num = reader.GetInteger("llama", "num_layer"); - model_param_.inter_size = reader.GetInteger("llama", "inter_size"); - model_param_.vocab_size = reader.GetInteger("llama", "vocab_size"); - model_param_.norm_eps = reader.GetFloat("llama", "norm_eps"); - model_param_.start_id = reader.GetInteger("llama", "start_id"); - model_param_.end_id = reader.GetInteger("llama", "end_id"); - attn_param_.cache_block_seq_len = reader.GetInteger("llama", "cache_block_seq_len", 0); - model_param_.quant_policy = reader.GetInteger("llama", "quant_policy", 0); + const auto model_reader = reader["model_config"]; + const auto attention_reader = reader["attention_config"]; + const auto lora_reader = reader["lora_config"]; + const auto engine_reader = reader["engine_config"]; + + model_name_ = model_reader["model_name"].as(); + model_param_.head_num = model_reader["head_num"].as(); + model_param_.head_dim = model_reader["size_per_head"].as(); + model_param_.kv_head_num = model_reader["kv_head_num"].as(0); + model_param_.hidden_units = model_reader["hidden_units"].as(); + model_param_.layer_num = model_reader["num_layer"].as(); + model_param_.inter_size = model_reader["inter_size"].as(); + model_param_.vocab_size = model_reader["vocab_size"].as(); + model_param_.norm_eps = model_reader["norm_eps"].as(); + model_param_.start_id = model_reader["start_id"].as(); + model_param_.end_id = model_reader["end_id"].as(); + attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); + model_param_.quant_policy = engine_reader["quant_policy"].as(0); // Only weight classes need these - attn_bias_ = reader.GetInteger("llama", "attn_bias", 0); - group_size_ = reader.GetInteger("llama", "group_size", 0); + attn_bias_ = model_reader["attn_bias"].as(0); + group_size_ = model_reader["group_size"].as(0); // rotary embedding parameters - attn_param_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding"); - attn_param_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f); - attn_param_.rope_scaling_type = reader.Get("llama", "rope_scaling_type", ""); - attn_param_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f); - attn_param_.low_freq_factor = reader.GetFloat("llama", "low_freq_factor", 1.0); - attn_param_.high_freq_factor = reader.GetFloat("llama", "high_freq_factor", 1.0); - attn_param_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0); - attn_param_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0); - attn_param_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0); - - attn_param_.original_max_position_embeddings = reader.GetInteger("llama", "original_max_position_embeddings", 0); - - engine_param_.max_batch_size = reader.GetInteger("llama", "max_batch_size", 0); - engine_param_.max_prefill_token_num = reader.GetInteger("llama", "max_prefill_token_num", 0); - engine_param_.max_context_token_num = reader.GetInteger("llama", "max_context_token_num", 0); - engine_param_.session_len = 
reader.GetInteger("llama", "session_len", 0); - engine_param_.step_length = reader.GetInteger("llama", "step_length", 0); - - engine_param_.cache_max_block_count = reader.GetFloat("llama", "cache_max_entry_count", 0); - engine_param_.cache_chunk_size = reader.GetInteger("llama", "cache_chunk_size", 0); - engine_param_.enable_prefix_caching = reader.GetBoolean("llama", "enable_prefix_caching", false); - - engine_param_.num_tokens_per_iter = reader.GetInteger("llama", "num_tokens_per_iter", 0); - engine_param_.max_prefill_iters = reader.GetInteger("llama", "max_prefill_iters", 1); - - lora_param_.policy = ft::getLoraPolicy(reader.Get("llama", "lora_policy", "")); - lora_param_.r = reader.GetInteger("llama", "lora_r", 0); - lora_param_.scale = reader.GetFloat("llama", "lora_scale", 0); - lora_param_.max_wo_r = reader.GetInteger("llama", "lora_max_wo_r", 0); - lora_param_.rank_pattern = getLoraPattern(reader.Get("llama", "lora_rank_pattern", ""), + attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); + attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.rope_scaling_type = attention_reader["rope_scaling_type"].as(""); + attn_param_.rope_scaling_factor = attention_reader["rope_scaling_factor"].as(0.f); + attn_param_.low_freq_factor = attention_reader["low_freq_factor"].as(1.0); + attn_param_.high_freq_factor = attention_reader["high_freq_factor"].as(1.0); + attn_param_.max_position_embeddings = attention_reader["max_position_embeddings"].as(0); + attn_param_.use_dynamic_ntk = attention_reader["use_dynamic_ntk"].as(0); + attn_param_.use_logn_attn = attention_reader["use_logn_attn"].as(0); + + attn_param_.original_max_position_embeddings = attention_reader["original_max_position_embeddings"].as(0); + + engine_param_.max_batch_size = engine_reader["max_batch_size"].as(0); + engine_param_.max_prefill_token_num = engine_reader["max_prefill_token_num"].as(0); + engine_param_.max_context_token_num = engine_reader["max_context_token_num"].as(0); + engine_param_.session_len = model_reader["session_len"].as(0); + + engine_param_.cache_max_block_count = engine_reader["cache_max_entry_count"].as(0); + engine_param_.cache_chunk_size = engine_reader["cache_chunk_size"].as(0); + engine_param_.enable_prefix_caching = engine_reader["enable_prefix_caching"].as(false); + + engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); + engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); + + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(reader.Get("llama", "lora_scale_pattern", ""), + lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); handleMissingParams(); @@ -273,7 +277,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const auto device_count = ft::getDeviceCount(); engines_.resize(device_count); - const std::string weight_type_str = reader.Get("llama", "weight_type"); + const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16") { weight_type_ = ft::WeightType::kFP16; } diff --git 
a/src/turbomind/utils/cuda_utils.cc b/src/turbomind/utils/cuda_utils.cc index db783c5637..c13688ff3a 100644 --- a/src/turbomind/utils/cuda_utils.cc +++ b/src/turbomind/utils/cuda_utils.cc @@ -366,33 +366,6 @@ cudaError_t getSetDevice(int i_device, int* o_device) return cudaSuccess; } -FtCudaDataType getModelFileType(std::string ini_file, std::string section_name) -{ - FtCudaDataType model_file_type; - INIReader reader = INIReader(ini_file); - if (reader.ParseError() < 0) { - TM_LOG_WARNING("Can't load %s. Use FP32 as default", ini_file.c_str()); - model_file_type = FtCudaDataType::FP32; - } - else { - std::string weight_data_type_str = std::string(reader.Get(section_name, "weight_data_type")); - if (weight_data_type_str.find("fp32") != std::string::npos) { - model_file_type = FtCudaDataType::FP32; - } - else if (weight_data_type_str.find("fp16") != std::string::npos) { - model_file_type = FtCudaDataType::FP16; - } - else if (weight_data_type_str.find("bf16") != std::string::npos) { - model_file_type = FtCudaDataType::BF16; - } - else { - TM_LOG_WARNING("Invalid type %s. Use FP32 as default", weight_data_type_str.c_str()); - model_file_type = FtCudaDataType::FP32; - } - } - return model_file_type; -} - bool is_16xx_series(const char* name) { const std::regex re(R"(GTX 16\d\d)"); diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 533263604e..2148fcc164 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -16,11 +16,11 @@ #pragma once -#include "3rdparty/INIReader.h" #include "src/turbomind/macro.h" #include "src/turbomind/utils/cuda_bf16_wrapper.h" #include "src/turbomind/utils/logger.h" +#include #include #include #include @@ -384,8 +384,6 @@ struct getTypeFromCudaDataType { }; #endif -FtCudaDataType getModelFileType(std::string ini_file, std::string section_name); - // clang-format off template struct packed_type; template <> struct packed_type { using type = float; }; // we don't need to pack float by default diff --git a/tests/test_lmdeploy/test_async_engine.py b/tests/test_lmdeploy/test_async_engine.py index 872b6b1abc..0123b2a43c 100644 --- a/tests/test_lmdeploy/test_async_engine.py +++ b/tests/test_lmdeploy/test_async_engine.py @@ -1,4 +1,3 @@ -import configparser import os import tempfile @@ -23,13 +22,12 @@ def test_get_names_from_turbomind_model(): os.makedirs(os.path.join(workspace, 'triton_models', 'weights'), exist_ok=True) - expected_chat_template = 'internlm2' - config = configparser.ConfigParser() - config.add_section('llama') - config.set('llama', 'chat_template', expected_chat_template) + import yaml - with open(f'{workspace}/triton_models/weights/config.ini', 'w') as f: - config.write(f) + expected_chat_template = 'internlm2' + config = dict(model_config=dict(chat_template=expected_chat_template)) + with open(f'{workspace}/triton_models/weights/config.yaml', 'w') as f: + yaml.safe_dump(config, f) _, chat_template = get_names_from_model(workspace) assert chat_template == expected_chat_template diff --git a/tests/test_lmdeploy/test_messages.py b/tests/test_lmdeploy/test_messages.py index f3d44355df..0453602c71 100644 --- a/tests/test_lmdeploy/test_messages.py +++ b/tests/test_lmdeploy/test_messages.py @@ -1,15 +1,13 @@ from typing import List -from lmdeploy import EngineGenerationConfig, GenerationConfig, Tokenizer +from lmdeploy import GenerationConfig, Tokenizer def test_engine_generation_config(): tokenizer = Tokenizer('internlm/internlm-chat-7b') config = GenerationConfig(n=3, stop_words=['']) - 
_config = EngineGenerationConfig.From(config, tokenizer) - - assert _config.n == config.n == 3 and \ - _config.max_new_tokens == config.max_new_tokens and \ - _config.temperature == config.temperature - assert isinstance(_config.stop_words, List) and \ - isinstance(_config.stop_words[0], int) + stop_token_ids = tokenizer.encode('', add_bos=False) + config.convert_stop_bad_words_to_ids(tokenizer) + assert stop_token_ids == config.stop_token_ids + assert isinstance(config.stop_token_ids, List) and \ + isinstance(config.stop_token_ids[0], int) diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py index 95b3e691a6..0d125fe74c 100644 --- a/tests/test_lmdeploy/test_turbomind/test_converter.py +++ b/tests/test_lmdeploy/test_turbomind/test_converter.py @@ -44,10 +44,10 @@ def test_registered_models(): output_name, config, _ = get_output_model_registered_name_and_config( model, model_format=model_format, group_size=0) assert output_name == register_name - assert config.group_size == group_size + assert config.model_config.group_size == group_size assert config.weight_type == weight_type assert config.session_len > 0 - assert config.model_arch is not None + assert config.model_config.model_arch is not None def test_update_from_engine_config(): @@ -61,26 +61,7 @@ def test_update_from_engine_config(): config = copy.deepcopy(_config) config.update_from_engine_config(TurbomindEngineConfig()) assert config.tensor_para_size == 1 - assert config.session_len == 32776 - assert config.max_batch_size == 128 - assert config.cache_max_entry_count == 0.8 - assert config.quant_policy == 0 - assert config.max_prefill_iters == 5 - assert config.num_tokens_per_iter == 8192 - - config = copy.deepcopy(_config) - config.update_from_engine_config( - TurbomindEngineConfig(max_prefill_token_num=2048, - num_tokens_per_iter=0)) - assert config.max_prefill_iters == 17 - assert config.num_tokens_per_iter == 2048 - - config = copy.deepcopy(_config) - config.update_from_engine_config( - TurbomindEngineConfig(max_prefill_token_num=2048, - num_tokens_per_iter=256)) - assert config.max_prefill_iters == 1 - assert config.num_tokens_per_iter == 256 + assert config.session_len == 32768 config = copy.deepcopy(_config) engine_config = TurbomindEngineConfig(model_format='hf', @@ -98,11 +79,7 @@ def test_update_from_engine_config(): assert (config.tensor_para_size == engine_config.tp) assert (config.session_len == engine_config.session_len) - assert (config.max_batch_size == engine_config.max_batch_size) + assert (config.attention_config.rope_scaling_factor == + engine_config.rope_scaling_factor) assert ( - config.cache_max_entry_count == engine_config.cache_max_entry_count) - assert (config.quant_policy == engine_config.quant_policy) - assert (config.rope_scaling_factor == engine_config.rope_scaling_factor) - assert (config.use_logn_attn == engine_config.use_logn_attn) - assert (config.max_prefill_iters == engine_config.max_prefill_iters) - assert (config.num_tokens_per_iter == engine_config.num_tokens_per_iter) + config.attention_config.use_logn_attn == engine_config.use_logn_attn)
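The `test_messages.py` change above reflects the move from token-id `stop_words`/`bad_words` on `EngineGenerationConfig` to `stop_token_ids`/`bad_token_ids` on `GenerationConfig`, with string stop words converted in place. A minimal usage sketch, assuming the tokenizer is reachable; the `'<eoa>'` stop word is a placeholder and not taken from this patch.

```python
# Usage sketch based on the updated test; '<eoa>' is a placeholder stop word.
from typing import List

from lmdeploy import GenerationConfig, Tokenizer

tokenizer = Tokenizer('internlm/internlm-chat-7b')
gen_config = GenerationConfig(n=3, stop_words=['<eoa>'])

# convert string stop/bad words into token ids on the config itself
gen_config.convert_stop_bad_words_to_ids(tokenizer)

assert isinstance(gen_config.stop_token_ids, List)
assert isinstance(gen_config.stop_token_ids[0], int)
```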
diff --git a/tests/test_lmdeploy/test_utils.py b/tests/test_lmdeploy/test_utils.py index ebafdc2634..cdabf98a24 100644 --- a/tests/test_lmdeploy/test_utils.py +++ b/tests/test_lmdeploy/test_utils.py @@ -1,6 +1,8 @@ from transformers import AutoConfig -from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig +from lmdeploy.turbomind.deploy.config import (ModelConfig, + TurbomindModelConfig, + config_from_dict) from lmdeploy.utils import _get_and_verify_max_len @@ -20,7 +22,7 @@ def test_get_and_verify_max_len(): assert (_get_and_verify_max_len(config, 102400) == 102400) # with TurbomindModelConfig - config = TurbomindModelConfig.from_dict({}, allow_none=True) - config.session_len = 4096 + config = config_from_dict(TurbomindModelConfig, {}) + config.model_config = config_from_dict(ModelConfig, dict(session_len=4096)) assert (_get_and_verify_max_len(config, None) == config.session_len) assert (_get_and_verify_max_len(config, 1024) == 1024)
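For reference, a minimal sketch of the workspace round trip exercised by `test_get_names_from_turbomind_model` above: a converted model now stores `triton_models/weights/config.yaml` with nested sections instead of `config.ini`. The `chat_template` value below is a placeholder.

```python
# Minimal sketch of the new workspace config.yaml layout; values are placeholders.
import os
import tempfile

import yaml

with tempfile.TemporaryDirectory() as workspace:
    weights_dir = os.path.join(workspace, 'triton_models', 'weights')
    os.makedirs(weights_dir, exist_ok=True)

    # roughly what `lmdeploy convert` now writes instead of config.ini
    config = dict(model_config=dict(chat_template='internlm2'))
    with open(os.path.join(weights_dir, 'config.yaml'), 'w') as f:
        yaml.safe_dump(config, f)

    # roughly what get_names_from_model() / _from_workspace() read back
    with open(os.path.join(weights_dir, 'config.yaml')) as f:
        loaded = yaml.safe_load(f)
    assert loaded['model_config']['chat_template'] == 'internlm2'
```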