Commit

Merge remote-tracking branch 'origin/main' into split
irexyc committed Aug 26, 2024
2 parents b7dc61b + ada7285 commit 8ccbec3
Showing 322 changed files with 17,578 additions and 8,818 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -43,7 +43,7 @@ jobs:
- name: Check docstring coverage
run: |
python -m pip install interrogate
interrogate -v --exclude ./lmdeploy/pytorch_poc/modeling/ --ignore-init-method --ignore-magic --ignore-module --ignore-private --ignore-nested-functions --ignore-nested-classes --fail-under 80 lmdeploy
interrogate -v --exclude ./lmdeploy/pytorch_poc/modeling/ --ignore-init-method --ignore-magic --ignore-module --ignore-private --ignore-nested-functions --ignore-nested-classes --fail-under 70 lmdeploy
- name: Check pylint score
run: |
python -m pip install pylint
57 changes: 27 additions & 30 deletions .github/workflows/pr_ete_test.yml
@@ -24,6 +24,7 @@ env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA


jobs:
@@ -33,58 +34,54 @@ jobs:
env:
REPORT_DIR: /nvme/qa_test_models/test-reports
container:
image: nvcr.io/nvidia/tritonserver:24.03-py3
image: nvidia/cuda:12.4.1-devel-ubuntu22.04
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never"
volumes:
- /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
- /nvme/share_data/github-actions/packages:/root/packages
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/187:/mnt/187
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Setup systems
run: |
rm /etc/apt/sources.list.d/cuda*.list
apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \
libgoogle-glog-dev libgl1 openjdk-8-jre-headless
rm -rf /var/lib/apt/lists/*
rm /etc/apt/sources.list.d/cuda*.list && apt-get update -y && apt-get install -y software-properties-common wget vim &&\
add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \
ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3
echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV"
- name: Clone repository
uses: actions/checkout@v2
- name: Install pytorch
run: |
python3 -m pip cache dir
python3 -m pip install torch==2.1.0 torchvision==0.16.0
python3 -m pip install --upgrade pip setuptools==69.5.1
python3 -m pip install torch==2.3.0 torchvision==0.18.0
# install the package from https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.5.8+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
python3 -m pip install /root/packages/flash_attn-2.6.3+cu123torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
- name: Build lmdeploy
run: |
python3 -m pip install cmake
python3 -m pip install -r requirements/build.txt
mkdir build
cd build
cp -r /nvme/qa_test_models/offline_pkg/_deps .
cmake .. \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=/opt/tritonserver \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON \
-DSM=80 \
-DCMAKE_CUDA_ARCHITECTURES=80 \
-DBUILD_TEST=OFF
make -j$(nproc) && make install
- name: Install lmdeploy
run: |
python3 -m pip install packaging transformers_stream_generator transformers datasets openai einops
python3 -m pip install -r requirements.txt -r requirements/test.txt
python3 -m pip install .
cp /nvme/qa_test_models/offline_pkg/openmpi-4.1.5.tar.gz .
tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi
make -j$(nproc) && make install && cd .. && rm -rf openmpi-4.1.5*
export PATH=$PATH:/usr/local/openmpi/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib
python3 -m pip install cmake packaging wheel transformers_stream_generator transformers datasets openai einops timm decord
python3 -m pip install -r requirements.txt -r requirements/test.txt -r requirements/build.txt
mkdir -p build && cd build &&\
sh ../generate.sh &&\
ninja -j$(nproc) && ninja install &&\
cd .. &&\
python3 -m pip install -e . &&\
rm -rf build
- name: Check env
run: |
python3 -m pip list
lmdeploy check_env
- name: Test lmdeploy
run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test -x --alluredir=allure-results --clean-alluredir
run: |
CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_2' -x --alluredir=allure-results --clean-alluredir
CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_1' -n 2 -x --alluredir=allure-results
- name: Generate reports
if: always()
run: |
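The `-m 'pr_test and gpu_num_2'` / `-m 'pr_test and gpu_num_1'` expressions in the test step above select tests by pytest markers. A hypothetical sketch of how such a marker-tagged test would look (the marker names come from the commands; the file and test body are assumptions, not code from this repository):

```python
import pytest

# Hypothetical example of a test selected by `-m 'pr_test and gpu_num_2'`.
# The single-GPU subset (`gpu_num_1`) runs separately with `-n 2`, so two
# one-GPU tests can share the two visible devices.
@pytest.mark.pr_test
@pytest.mark.gpu_num_2
def test_chat_tp2():
    """Placeholder body; the real tests live under autotest/."""
    assert True
```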
3 changes: 2 additions & 1 deletion .github/workflows/pypi.yml
@@ -65,7 +65,8 @@ jobs:
python-version: ${{ matrix.pyver }}
- name: Install python packages
run: |
pip install pybind11 wheel
pip install -r requirements/build.txt
pip install wheel
- name: Setup CUDA Toolkit
id: cuda-toolkit
shell: pwsh
3 changes: 2 additions & 1 deletion .github/workflows/windows-x64-gpu.yml
@@ -40,7 +40,8 @@ jobs:
python-version: '3.8'
- name: Install python packages
run: |
pip install pybind11 wheel
pip install -r requirements/build.txt
pip install wheel
- name: Setup CUDA Toolkit
id: cuda-toolkit
shell: pwsh
1 change: 1 addition & 0 deletions .gitignore
@@ -50,6 +50,7 @@ dist/
examples/cpp/llama/*.csv
*.npy
*.weight
install/

# LMDeploy
workspace/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -44,7 +44,7 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
args: ["--skip=third_party/*,*.ipynb,*.proto"]
args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"]

- repo: https://github.com/myint/docformatter
rev: v1.4
4 changes: 3 additions & 1 deletion CMakeLists.txt
@@ -143,6 +143,9 @@ if (NOT CMAKE_CUDA_ARCHITECTURES)
if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-real)
endif ()
if (MSVC)
list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES 80-real 90-real)
endif ()
endif ()

message(STATUS "Building with CUDA archs: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -322,7 +325,6 @@ add_library(transformer-shared SHARED
$<TARGET_OBJECTS:DynamicDecodeLayer>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:gemm_s4_f16>
$<TARGET_OBJECTS:TopKSamplingLayer>
$<TARGET_OBJECTS:TopPSamplingLayer>
$<TARGET_OBJECTS:TransformerTritonBackend>
37 changes: 19 additions & 18 deletions README.md
@@ -28,7 +28,7 @@ ______________________________________________________________________

- \[2024/08\] 🔥🔥 LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLM inference
- \[2024/07\] 🎉🎉 Support Llama3.1 8B and 70B, together with their tool calling
- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5
- \[2024/07\] Support [InternVL2](docs/en/multi_modal/internvl.md) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5
- \[2024/06\] The PyTorch engine supports DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, and LLaVA-Next
- \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs
- \[2024/05\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2
@@ -39,8 +39,8 @@ ______________________________________________________________________
- \[2024/03\] Support DeepSeek-VL offline inference pipeline and serving.
- \[2024/03\] Support VLM offline inference pipeline and serving.
- \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on.
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/api_server.md).
- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md)
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](docs/en/llm/api_server.md).
- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](docs/en/llm/proxy_server.md)
- \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies.

</details>
@@ -53,7 +53,7 @@ ______________________________________________________________________
- \[2023/11\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75
- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/llm/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2.
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
@@ -136,6 +136,8 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>Dbrx (132B)</li>
<li>StarCoder2 (3B - 15B)</li>
<li>Phi-3-mini (3.8B)</li>
<li>Phi-3.5-mini (3.8B)</li>
<li>Phi-3.5-MoE (16x3.8B)</li>
</ul>
</td>
<td>
@@ -151,7 +153,9 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>CogVLM-Chat (17B)</li>
<li>CogVLM2-Chat (19B)</li>
<li>MiniCPM-Llama3-V-2_5</li>
<li>MiniCPM-V-2_6</li>
<li>Phi-3-vision (4.2B)</li>
<li>Phi-3.5-vision (4.2B)</li>
<li>GLM-4V (9B)</li>
</ul>
</td>
@@ -167,19 +171,16 @@ They differ in the types of supported models and the inference data type. Please

## Installation

Install lmdeploy with pip (Python 3.8+) or [from source](./docs/en/build.md)
It is recommended to install lmdeploy with pip in a conda environment (Python 3.8 - 3.12):

```shell
conda create -n lmdeploy python=3.8 -y
conda activate lmdeploy
pip install lmdeploy
```

Since v0.3.0, the default prebuilt package has been compiled on **CUDA 12**. However, if CUDA 11+ is required, you can install lmdeploy by:

```shell
export LMDEPLOY_VERSION=0.5.2
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
The default prebuilt package is compiled on **CUDA 12** since v0.3.0.
For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](./docs/en/installation.md).

## Offline Batch Inference

@@ -195,7 +196,7 @@ print(response)
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
For more information about inference pipeline, please refer to [here](./docs/en/inference/pipeline.md).
For more information about inference pipeline, please refer to [here](docs/en/llm/pipeline.md).
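
For context, the `print(response)` fragment above is the tail of the README's offline-inference snippet. A minimal sketch of that pipeline usage (the model name below is illustrative, not necessarily the one in the original snippet):

```python
from lmdeploy import pipeline

# Minimal sketch: build a pipeline and run a batch of prompts offline.
# The model id is an example; any model supported by lmdeploy works.
pipe = pipeline('internlm/internlm2-chat-7b')
response = pipe(['Hi, please introduce yourself', 'Shanghai is'])
print(response)
```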

# Tutorials

@@ -204,10 +205,10 @@ Please review [getting_started](./docs/en/get_started.md) section for the basic
For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/):

- User Guide
- [LLM Inference pipeline](./docs/en/inference/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)
- [VLM Inference pipeline](./docs/en/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing)
- [LLM Serving](docs/en/serving/api_server.md)
- [VLM Serving](docs/en/serving/api_server_vl.md)
- [LLM Inference pipeline](docs/en/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ)
- [VLM Inference pipeline](docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing)
- [LLM Serving](docs/en/llm/api_server.md)
- [VLM Serving](docs/en/multi_modal/api_server_vl.md)
- [Quantization](docs/en/quantization)
- Advanced Guide
- [Inference Engine - TurboMind](docs/en/inference/turbomind.md)
@@ -216,7 +217,7 @@ For detailed user guides and advanced guides, please refer to our [tutorials](ht
- [Add a new model](docs/en/advance/pytorch_new_model.md)
- gemm tuning
- [Long context inference](docs/en/advance/long_context.md)
- [Multi-model inference service](docs/en/serving/proxy_server.md)
- [Multi-model inference service](docs/en/llm/proxy_server.md)

# Third-party projects

