Commit

Merge remote-tracking branch 'origin/main' into split
irexyc committed Aug 26, 2024
2 parents b7dc61b + ada7285 commit 8ccbec3
Showing 322 changed files with 17,578 additions and 8,818 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -43,7 +43,7 @@ jobs:
- name: Check docstring coverage
run: |
python -m pip install interrogate
interrogate -v --exclude ./lmdeploy/pytorch_poc/modeling/ --ignore-init-method --ignore-magic --ignore-module --ignore-private --ignore-nested-functions --ignore-nested-classes --fail-under 80 lmdeploy
interrogate -v --exclude ./lmdeploy/pytorch_poc/modeling/ --ignore-init-method --ignore-magic --ignore-module --ignore-private --ignore-nested-functions --ignore-nested-classes --fail-under 70 lmdeploy
- name: Check pylint score
run: |
python -m pip install pylint
57 changes: 27 additions & 30 deletions .github/workflows/pr_ete_test.yml
@@ -24,6 +24,7 @@ env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA


jobs:
@@ -33,58 +34,54 @@ jobs:
env:
REPORT_DIR: /nvme/qa_test_models/test-reports
container:
image: nvcr.io/nvidia/tritonserver:24.03-py3
image: nvidia/cuda:12.4.1-devel-ubuntu22.04
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never"
volumes:
- /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
- /nvme/share_data/github-actions/packages:/root/packages
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/187:/mnt/187
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Setup systems
run: |
rm /etc/apt/sources.list.d/cuda*.list
apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \
libgoogle-glog-dev libgl1 openjdk-8-jre-headless
rm -rf /var/lib/apt/lists/*
rm /etc/apt/sources.list.d/cuda*.list && apt-get update -y && apt-get install -y software-properties-common wget vim &&\
add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \
ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3
echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV"
- name: Clone repository
uses: actions/checkout@v2
- name: Install pytorch
run: |
python3 -m pip cache dir
python3 -m pip install torch==2.1.0 torchvision==0.16.0
python3 -m pip install --upgrade pip setuptools==69.5.1
python3 -m pip install torch==2.3.0 torchvision==0.18.0
# install the package from https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.5.8+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
python3 -m pip install /root/packages/flash_attn-2.6.3+cu123torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
- name: Build lmdeploy
run: |
python3 -m pip install cmake
python3 -m pip install -r requirements/build.txt
mkdir build
cd build
cp -r /nvme/qa_test_models/offline_pkg/_deps .
cmake .. \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=/opt/tritonserver \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON \
-DSM=80 \
-DCMAKE_CUDA_ARCHITECTURES=80 \
-DBUILD_TEST=OFF
make -j$(nproc) && make install
- name: Install lmdeploy
run: |
python3 -m pip install packaging transformers_stream_generator transformers datasets openai einops
python3 -m pip install -r requirements.txt -r requirements/test.txt
python3 -m pip install .
cp /nvme/qa_test_models/offline_pkg/openmpi-4.1.5.tar.gz .
tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi
make -j$(nproc) && make install && cd .. && rm -rf openmpi-4.1.5*
export PATH=$PATH:/usr/local/openmpi/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib
python3 -m pip install cmake packaging wheel transformers_stream_generator transformers datasets openai einops timm decord
python3 -m pip install -r requirements.txt -r requirements/test.txt -r requirements/build.txt
mkdir -p build && cd build &&\
sh ../generate.sh &&\
ninja -j$(nproc) && ninja install &&\
cd .. &&\
python3 -m pip install -e . &&\
rm -rf build
- name: Check env
run: |
python3 -m pip list
lmdeploy check_env
- name: Test lmdeploy
run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test -x --alluredir=allure-results --clean-alluredir
run: |
CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_2' -x --alluredir=allure-results --clean-alluredir
CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_1' -n 2 -x --alluredir=allure-results
- name: Generate reports
if: always()
run: |
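The `-m 'pr_test and gpu_num_2'` / `-m 'pr_test and gpu_num_1'` expressions in the test step above select tests by pytest markers. A hypothetical sketch of how such a marker-tagged test would look (the marker names come from the commands; the file and test body are assumptions, not code from this repository):

```python
import pytest

# Hypothetical example of a test selected by `-m 'pr_test and gpu_num_2'`.
# The single-GPU subset (`gpu_num_1`) runs separately with `-n 2`, so two
# one-GPU tests can share the two visible devices.
@pytest.mark.pr_test
@pytest.mark.gpu_num_2
def test_chat_tp2():
    """Placeholder body; the real tests live under autotest/."""
    assert True
```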
3 changes: 2 additions & 1 deletion .github/workflows/pypi.yml
@@ -65,7 +65,8 @@ jobs:
python-version: ${{ matrix.pyver }}
- name: Install python packages
run: |
pip install pybind11 wheel
pip install -r requirements/build.txt
pip install wheel
- name: Setup CUDA Toolkit
id: cuda-toolkit
shell: pwsh
3 changes: 2 additions & 1 deletion .github/workflows/windows-x64-gpu.yml
@@ -40,7 +40,8 @@ jobs:
python-version: '3.8'
- name: Install python packages
run: |
pip install pybind11 wheel
pip install -r requirements/build.txt
pip install wheel
- name: Setup CUDA Toolkit
id: cuda-toolkit
shell: pwsh
1 change: 1 addition & 0 deletions .gitignore
@@ -50,6 +50,7 @@ dist/
examples/cpp/llama/*.csv
*.npy
*.weight
install/

# LMDeploy
workspace/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -44,7 +44,7 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args: ["--skip=third_party/*,*.ipynb,*.proto"]
args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"]

- repo: https://github.com/myint/docformatter
rev: v1.4
4 changes: 3 additions & 1 deletion CMakeLists.txt
@@ -143,6 +143,9 @@ if (NOT CMAKE_CUDA_ARCHITECTURES)
if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-real)
endif ()
if (MSVC)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES 80-real 90-real)
endif ()
endif ()

message(STATUS "Building with CUDA archs: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -322,7 +325,6 @@ add_library(transformer-shared SHARED
$<TARGET_OBJECTS:DynamicDecodeLayer>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:gemm_s4_f16>
$<TARGET_OBJECTS:TopKSamplingLayer>
$<TARGET_OBJECTS:TopPSamplingLayer>
$<TARGET_OBJECTS:TransformerTritonBackend>
37 changes: 19 additions & 18 deletions README.md
@@ -28,7 +28,7 @@ ______________________________________________________________________

- \[2024/08\] 🔥🔥 LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLM inference
- \[2024/07\] 🎉🎉 Support Llama3.1 8B and 70B, together with their tool calling
- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5
- \[2024/07\] Support [InternVL2](docs/en/multi_modal/internvl.md) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5
- \[2024/06\] The PyTorch engine supports DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, and LLaVA-Next
- \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs
- \[2024/05\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2
@@ -39,8 +39,8 @@ ______________________________________________________________________
- \[2024/03\] Support DeepSeek-VL offline inference pipeline and serving.
- \[2024/03\] Support VLM offline inference pipeline and serving.
- \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on.
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/api_server.md).
- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md)
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](docs/en/llm/api_server.md).
- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](docs/en/llm/proxy_server.md)
- \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies.

</details>
@@ -53,7 +53,7 @@ ______________________________________________________________________
- \[2023/11\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75
- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/llm/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2.
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
@@ -136,6 +136,8 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>Dbrx (132B)</li>
<li>StarCoder2 (3B - 15B)</li>
<li>Phi-3-mini (3.8B)</li>
<li>Phi-3.5-mini (3.8B)</li>
<li>Phi-3.5-MoE (16x3.8B)</li>
</ul>
</td>
<td>
@@ -151,7 +153,9 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>CogVLM-Chat (17B)</li>
<li>CogVLM2-Chat (19B)</li>
<li>MiniCPM-Llama3-V-2_5</li>
<li>MiniCPM-V-2_6</li>
<li>Phi-3-vision (4.2B)</li>
<li>Phi-3.5-vision (4.2B)</li>
<li>GLM-4V (9B)</li>
</ul>
</td>
@@ -167,19 +171,16 @@ They differ in the types of supported models and the inference data type. Please

## Installation

Install lmdeploy with pip (Python 3.8+) or [from source](./docs/en/build.md)
It is recommended to install lmdeploy with pip in a conda environment (Python 3.8 - 3.12):

```shell
conda create -n lmdeploy python=3.8 -y
conda activate lmdeploy
pip install lmdeploy
```

Since v0.3.0, the default prebuilt package has been compiled on **CUDA 12**. However, if CUDA 11+ is required, you can install lmdeploy by:

```shell
export LMDEPLOY_VERSION=0.5.2
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
The default prebuilt package is compiled on **CUDA 12** since v0.3.0.
For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](./docs/en/installation.md).

## Offline Batch Inference

@@ -195,7 +196,7 @@ print(response)
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
For more information about inference pipeline, please refer to [here](./docs/en/inference/pipeline.md).
For more information about inference pipeline, please refer to [here](docs/en/llm/pipeline.md).
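
For context, the `print(response)` fragment above is the tail of the README's offline-inference snippet. A minimal sketch of that pipeline usage (the model name below is illustrative, not necessarily the one in the original snippet):

```python
from lmdeploy import pipeline

# Minimal sketch: build a pipeline and run a batch of prompts offline.
# The model id is an example; any model supported by lmdeploy works.
pipe = pipeline('internlm/internlm2-chat-7b')
response = pipe(['Hi, please introduce yourself', 'Shanghai is'])
print(response)
```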

# Tutorials

@@ -204,10 +205,10 @@ Please review [getting_started](./docs/en/get_started.md) section for the basic
For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/):

- User Guide
- [LLM Inference pipeline](./docs/en/inference/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)
- [VLM Inference pipeline](./docs/en/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing)
- [LLM Serving](docs/en/serving/api_server.md)
- [VLM Serving](docs/en/serving/api_server_vl.md)
- [LLM Inference pipeline](docs/en/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)
- [VLM Inference pipeline](docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing)
- [LLM Serving](docs/en/llm/api_server.md)
- [VLM Serving](docs/en/multi_modal/api_server_vl.md)
- [Quantization](docs/en/quantization)
- Advanced Guide
- [Inference Engine - TurboMind](docs/en/inference/turbomind.md)
@@ -216,7 +217,7 @@ For detailed user guides and advanced guides, please refer to our [tutorials](ht
- [Add a new model](docs/en/advance/pytorch_new_model.md)
- gemm tuning
- [Long context inference](docs/en/advance/long_context.md)
- [Multi-model inference service](docs/en/serving/proxy_server.md)
- [Multi-model inference service](docs/en/llm/proxy_server.md)

# Third-party projects

