diff --git a/.github/workflows/cuda11.8-whl-release.yml b/.github/workflows/cuda11.8-whl-release.yml index a762e32b74..b167b33c38 100644 --- a/.github/workflows/cuda11.8-whl-release.yml +++ b/.github/workflows/cuda11.8-whl-release.yml @@ -46,11 +46,12 @@ jobs: sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: builder/manywheel/${{ env.OUTPUT_FOLDER }}/* retention-days: 1 + name: linux-${{ matrix.pyver }} windows-build: strategy: @@ -89,11 +90,12 @@ jobs: rm build -Force -Recurse python setup.py bdist_wheel -d build/wheel - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: build/wheel/* retention-days: 1 + name: windows-${{ matrix.pyver }} publish: runs-on: ubuntu-latest @@ -105,11 +107,15 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Download artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 + with: + path: artifact + merge-multiple: true - name: Add cuda version to package name run: | ver=$(cat lmdeploy/version.py | grep '__version__ =' | cut -d\' -f2) cuver=$ver+cu118 + ls -lh cd artifact for file in *; do mv "$file" "`echo $file | sed "s/$ver/$cuver/g"`"; diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index e7e03d44f9..bd7d6c259f 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -32,7 +32,7 @@ on: required: true description: 'Dependency packages, you can also set a specific version' type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord' + default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq' tools_regression: required: true description: 'Whether start a tool regression' @@ -58,11 +58,12 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord'}} + dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq'}} HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }} + COV_PATH: /opt/py3/lib/python3.10/site-packages/lmdeploy jobs: linux-build: @@ -170,81 +171,105 @@ jobs: continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - quantization w8a8 continue-on-error: 
true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'quantization')) run: | - pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - convert continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'convert')) run: | - pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/convert -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - chat hf torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'chat')) run: | - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} 
|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - pipeline turbomind vl continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'pipeline')) run: | - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful turbomind continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest 
autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful turbomind vl continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind-vl') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_turbomind_vl.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful workspace continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'turbomind') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_workspace.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - restful torch continue-on-error: true if: github.event_name == 'schedule' || (contains(fromJSON(github.event.inputs.backend), 'pytorch') && contains(fromJSON(github.event.inputs.model), 'restful')) run: | - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_pytorch.py -m 'gpu_num_2 and 
not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Test lmdeploy - local testcase if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.model), 'local_case') run: | - pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} + pytest /local_case/issue_regression --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}}|| true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | @@ -323,7 +348,8 @@ jobs: - name: Test lmdeploy - restful api timeout-minutes: 75 run: | - pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} + pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() run: | @@ -343,7 +369,8 @@ jobs: - name: Test lmdeploy - restful api - base timeout-minutes: 40 run: | - pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Kill api server if: always() run: | @@ -409,10 +436,14 @@ jobs: rm -rf allure-results - name: Test lmdeploy - interface pipeline case run: | - pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} || true - pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | @@ -476,12 +507,13 @@ jobs: lmdeploy check_env - name: Test benchmark script run: | - pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function --lf --alluredir=${{env.REPORT_DIR}} + pytest autotest/benchmark -n 4 --run_id ${{ github.run_id }} -m function --lf --alluredir=${{env.REPORT_DIR}} --cov ${{env.COV_PATH}} || true + mv .coverage 
${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() run: | chmod -R 777 $REPORT_DIR - chmod -R 777 /nvme/qa_test_models/benchmark_reports/${{ github.run_id }} + chmod -R 777 /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} export workdir=$(pwd) cd .. rm -rf $workdir @@ -495,7 +527,7 @@ jobs: timeout-minutes: 5 runs-on: [self-hosted, linux-a100] env: - BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark_reports/${{ github.run_id }} + BENCHMARK_REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} steps: - name: Clone repository uses: actions/checkout@v3 @@ -507,17 +539,69 @@ jobs: pip install pandas fire mmengine python3 .github/scripts/action_tools.py generate_benchmark_report $BENCHMARK_REPORT_DIR + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + needs: [test_tools, test_restful, test_pipeline, test_benchmark] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu11 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: cp -r /nvme/qa_test_models/offline_pkg/lmdeploy/. . + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Install lmdeploy + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: | + python3 -m pip install lmdeploy-*.whl + python3 -m pip install -r requirements/test.txt + - name: Install lmdeploy - offline + if: ${{inputs.offline_mode}} + run: | + python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + notify_to_feishu: if: always() && !cancelled() && (github.ref_name == 'develop' || github.ref_name == 'main') - needs: [test_tools, test_restful, test_pipeline, get_benchmark_result] + needs: [get_benchmark_result, get_coverage_report] timeout-minutes: 5 runs-on: [self-hosted, linux-a100] steps: - - name: fail notify + - name: notify if: contains(needs.*.result, 'failure') run: | - curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test failed!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. 
"},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} - - name: success notify - if: needs.test_tools.result=='success' && needs.test_restful.result=='success' && needs.test_pipeline.result=='success' - run: | - curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test success","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} + curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Lmdeploy- Daily test finished!!!","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.FEISHU_USER_ID }}'"}]]}}}}' ${{ secrets.FEISHU_WEBHOOK_URL }} diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 64e482c2c8..5b17fccb22 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -113,13 +113,13 @@ jobs: - name: Install pytorch run: | python3 -m pip cache dir - python3 -m pip install torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu118 + python3 -m pip install torch==2.3.0 torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu118 - name: Install lmdeploy - dependency run: | # manually install flash attn # the install packeage from. 
https://github.com/Dao-AILab/flash-attention/releases - python3 -m pip install /root/packages/flash_attn-2.5.7+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - python3 -m pip install -U 'xformers<=0.0.26' --index-url https://download.pytorch.org/whl/cu118 + python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl + python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps python3 -m pip install ${{env.dependency_pkgs}} - name: Install lmdeploy run: | diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 7c0ae8a24d..3a19ebe870 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -45,7 +45,8 @@ jobs: steps: - name: Setup systems run: | - rm /etc/apt/sources.list.d/cuda*.list && apt-get update -y && apt-get install -y software-properties-common wget vim &&\ + apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\ + curl https://sh.rustup.rs -sSf | sh -s -- -y &&\ add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \ ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \ && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3 diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index bcb992422f..4361e17dd4 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -45,11 +45,12 @@ jobs: sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: builder/manywheel/${{ env.OUTPUT_FOLDER }}/* retention-days: 1 + name: linux-${{ matrix.pyver }} windows-build: strategy: @@ -90,11 +91,12 @@ jobs: rm build -Force -Recurse python setup.py bdist_wheel -d build/wheel - name: Upload Artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: if-no-files-found: error path: build/wheel/* retention-days: 1 + name: windows-${{ matrix.pyver }} publish: runs-on: ubuntu-latest @@ -104,7 +106,10 @@ jobs: - windows-build steps: - name: Download artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 + with: + path: artifact + merge-multiple: true - name: Display artifacts run: ls artifact/ -lh - name: Set up python3.8 diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index fb68b5c37b..c946177c0e 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -78,7 +78,7 @@ jobs: strategy: fail-fast: false matrix: - model: ['internlm/internlm2-chat-20b'] + model: ['internlm/internlm2_5-20b-chat'] container: image: openmmlab/lmdeploy:latest-cu11 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 -e NO_PROXY=localhost,127.0.0.1 -e no_proxy=localhost,127.0.0.1 --pull never" @@ -134,10 +134,10 @@ jobs: sleep 120s - name: Test lmdeploy - restful api run: | - python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log - python3 
/nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log - python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-3.log + python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv &> ${{env.REPORT_DIR}}/stable.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv &> ${{env.REPORT_DIR}}/stable-internal-1.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv &> ${{env.REPORT_DIR}}/stable-internal-2.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv &> ${{env.REPORT_DIR}}/stable-internal-3.log - name: Kill api server if: always() run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c6c0a45bf3..62f19298d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,8 @@ repos: rev: v2.1.0 hooks: - id: codespell - args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"] + args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h,docker/Dockerfile_aarch64_ascend,docs/en/get_started/ascend/get_started.md,docs/zh_cn/get_started/ascend/get_started.md"] + - repo: https://github.com/myint/docformatter rev: v1.4 diff --git a/3rdparty/INIReader.h b/3rdparty/INIReader.h deleted file mode 100644 index 6ed9b5a5aa..0000000000 --- a/3rdparty/INIReader.h +++ /dev/null @@ -1,501 +0,0 @@ -// Read an INI file into easy-to-access name/value pairs. - -// inih and INIReader are released under the New BSD license. -// Go to the project home page for more info: -// -// https://github.com/benhoyt/inih (Initial repo) -// https://github.com/jtilly/inih (The reference of this header file) -/* inih -- simple .INI file parser -inih is released under the New BSD license (see LICENSE.txt). 
Go to the project -home page for more info: -https://github.com/benhoyt/inih -https://github.com/jtilly/inih -*/ - -#ifndef __INI_H__ -#define __INI_H__ - -/* Make this header file easier to include in C++ code */ -#ifdef __cplusplus -extern "C" { -#endif - -#include - -/* Typedef for prototype of handler function. */ -typedef int (*ini_handler)(void* user, const char* section, - const char* name, const char* value); - -/* Typedef for prototype of fgets-style reader function. */ -typedef char* (*ini_reader)(char* str, int num, void* stream); - -/* Parse given INI-style file. May have [section]s, name=value pairs - (whitespace stripped), and comments starting with ';' (semicolon). Section - is "" if name=value pair parsed before any section heading. name:value - pairs are also supported as a concession to Python's configparser. - For each name=value pair parsed, call handler function with given user - pointer as well as section, name, and value (data only valid for duration - of handler call). Handler should return nonzero on success, zero on error. - Returns 0 on success, line number of first error on parse error (doesn't - stop on first error), -1 on file open error, or -2 on memory allocation - error (only when INI_USE_STACK is zero). -*/ -int ini_parse(const char* filename, ini_handler handler, void* user); - -/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't - close the file when it's finished -- the caller must do that. */ -int ini_parse_file(FILE* file, ini_handler handler, void* user); - -/* Same as ini_parse(), but takes an ini_reader function pointer instead of - filename. Used for implementing custom or string-based I/O. */ -int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, - void* user); - -/* Nonzero to allow multi-line value parsing, in the style of Python's - configparser. If allowed, ini_parse() will call the handler with the same - name for each subsequent line parsed. */ -#ifndef INI_ALLOW_MULTILINE -#define INI_ALLOW_MULTILINE 1 -#endif - -/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of - the file. See http://code.google.com/p/inih/issues/detail?id=21 */ -#ifndef INI_ALLOW_BOM -#define INI_ALLOW_BOM 1 -#endif - -/* Nonzero to allow inline comments (with valid inline comment characters - specified by INI_INLINE_COMMENT_PREFIXES). Set to 0 to turn off and match - Python 3.2+ configparser behaviour. */ -#ifndef INI_ALLOW_INLINE_COMMENTS -#define INI_ALLOW_INLINE_COMMENTS 1 -#endif -#ifndef INI_INLINE_COMMENT_PREFIXES -#define INI_INLINE_COMMENT_PREFIXES ";" -#endif - -/* Nonzero to use stack, zero to use heap (malloc/free). */ -#ifndef INI_USE_STACK -#define INI_USE_STACK 1 -#endif - -/* Stop parsing on first error (default is to keep parsing). */ -#ifndef INI_STOP_ON_FIRST_ERROR -#define INI_STOP_ON_FIRST_ERROR 0 -#endif - -/* Maximum line length for any line in INI file. */ -#ifndef INI_MAX_LINE -#define INI_MAX_LINE 200 -#endif - -#ifdef __cplusplus -} -#endif - -/* inih -- simple .INI file parser -inih is released under the New BSD license (see LICENSE.txt). Go to the project -home page for more info: -https://github.com/benhoyt/inih -*/ - -#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include -#include -#include - -#if !INI_USE_STACK -#include -#endif - -#define MAX_SECTION 50 -#define MAX_NAME 50 - -/* Strip whitespace chars off end of given string, in place. Return s. 
*/ -inline static char* rstrip(char* s) -{ - char* p = s + strlen(s); - while (p > s && isspace((unsigned char)(*--p))) - *p = '\0'; - return s; -} - -/* Return pointer to first non-whitespace char in given string. */ -inline static char* lskip(const char* s) -{ - while (*s && isspace((unsigned char)(*s))) - s++; - return (char*)s; -} - -/* Return pointer to first char (of chars) or inline comment in given string, - or pointer to null at end of string if neither found. Inline comment must - be prefixed by a whitespace character to register as a comment. */ -inline static char* find_chars_or_comment(const char* s, const char* chars) -{ -#if INI_ALLOW_INLINE_COMMENTS - int was_space = 0; - while (*s && (!chars || !strchr(chars, *s)) && - !(was_space && strchr(INI_INLINE_COMMENT_PREFIXES, *s))) { - was_space = isspace((unsigned char)(*s)); - s++; - } -#else - while (*s && (!chars || !strchr(chars, *s))) { - s++; - } -#endif - return (char*)s; -} - -/* Version of strncpy that ensures dest (size bytes) is null-terminated. */ -inline static char* strncpy0(char* dest, const char* src, size_t size) -{ - strncpy(dest, src, size); - dest[size - 1] = '\0'; - return dest; -} - -/* See documentation in header file. */ -inline int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, - void* user) -{ - /* Uses a fair bit of stack (use heap instead if you need to) */ -#if INI_USE_STACK - char line[INI_MAX_LINE]; -#else - char* line; -#endif - char section[MAX_SECTION] = ""; - char prev_name[MAX_NAME] = ""; - - char* start; - char* end; - char* name; - char* value; - int lineno = 0; - int error = 0; - -#if !INI_USE_STACK - line = (char*)malloc(INI_MAX_LINE); - if (!line) { - return -2; - } -#endif - - /* Scan through stream line by line */ - while (reader(line, INI_MAX_LINE, stream) != NULL) { - lineno++; - - start = line; -#if INI_ALLOW_BOM - if (lineno == 1 && (unsigned char)start[0] == 0xEF && - (unsigned char)start[1] == 0xBB && - (unsigned char)start[2] == 0xBF) { - start += 3; - } -#endif - start = lskip(rstrip(start)); - - if (*start == ';' || *start == '#') { - /* Per Python configparser, allow both ; and # comments at the - start of a line */ - } -#if INI_ALLOW_MULTILINE - else if (*prev_name && *start && start > line) { - -#if INI_ALLOW_INLINE_COMMENTS - end = find_chars_or_comment(start, NULL); - if (*end) - *end = '\0'; - rstrip(start); -#endif - - /* Non-blank line with leading whitespace, treat as continuation - of previous name's value (as per Python configparser). 
*/ - if (!handler(user, section, prev_name, start) && !error) - error = lineno; - } -#endif - else if (*start == '[') { - /* A "[section]" line */ - end = find_chars_or_comment(start + 1, "]"); - if (*end == ']') { - *end = '\0'; - strncpy0(section, start + 1, sizeof(section)); - *prev_name = '\0'; - } - else if (!error) { - /* No ']' found on section line */ - error = lineno; - } - } - else if (*start) { - /* Not a comment, must be a name[=:]value pair */ - end = find_chars_or_comment(start, "=:"); - if (*end == '=' || *end == ':') { - *end = '\0'; - name = rstrip(start); - value = lskip(end + 1); -#if INI_ALLOW_INLINE_COMMENTS - end = find_chars_or_comment(value, NULL); - if (*end) - *end = '\0'; -#endif - rstrip(value); - - /* Valid name[=:]value pair found, call handler */ - strncpy0(prev_name, name, sizeof(prev_name)); - if (!handler(user, section, name, value) && !error) - error = lineno; - } - else if (!error) { - /* No '=' or ':' found on name[=:]value line */ - error = lineno; - } - } - -#if INI_STOP_ON_FIRST_ERROR - if (error) - break; -#endif - } - -#if !INI_USE_STACK - free(line); -#endif - - return error; -} - -/* See documentation in header file. */ -inline int ini_parse_file(FILE* file, ini_handler handler, void* user) -{ - return ini_parse_stream((ini_reader)fgets, file, handler, user); -} - -/* See documentation in header file. */ -inline int ini_parse(const char* filename, ini_handler handler, void* user) -{ - FILE* file; - int error; - - file = fopen(filename, "r"); - if (!file) - return -1; - error = ini_parse_file(file, handler, user); - fclose(file); - return error; -} - -#endif /* __INI_H__ */ - - -#ifndef __INIREADER_H__ -#define __INIREADER_H__ - -#include -#include -#include - -// Read an INI file into easy-to-access name/value pairs. (Note that I've gone -// for simplicity here rather than speed, but it should be pretty decent.) -class INIReader -{ -public: - // Empty Constructor - INIReader() {}; - - // Construct INIReader and parse given filename. See ini.h for more info - // about the parsing. - INIReader(std::string filename); - - // Construct INIReader and parse given file. See ini.h for more info - // about the parsing. - INIReader(FILE *file); - ~INIReader(); - // Return the result of ini_parse(), i.e., 0 on success, line number of - // first error on parse error, or -1 on file open error. - int ParseError() const; - - // Return the list of sections found in ini file - const std::set& Sections() const; - - // Get a string value from INI file, returning default_value if not found. - std::string Get(std::string section, std::string name, - std::string default_value) const; - std::string Get(std::string section, std::string name) const; - - // Get an integer (long) value from INI file, returning default_value if - // not found or not a valid integer (decimal "1234", "-1234", or hex "0x4d2"). - long GetInteger(std::string section, std::string name, long default_value) const; - long GetInteger(std::string section, std::string name) const; - - // Get a real (floating point double) value from INI file, returning - // default_value if not found or not a valid floating point value - // according to strtod(). - double GetReal(std::string section, std::string name, double default_value) const; - - // Get a single precision floating point number value from INI file, returning - // default_value if not found or not a valid floating point value - // according to strtof(). 
- float GetFloat(std::string section, std::string name, float default_value) const; - float GetFloat(std::string section, std::string name) const; - - // Get a boolean value from INI file, returning default_value if not found or if - // not a valid true/false value. Valid true values are "true", "yes", "on", "1", - // and valid false values are "false", "no", "off", "0" (not case sensitive). - bool GetBoolean(std::string section, std::string name, bool default_value) const; - -protected: - int _error; - std::map _values; - std::set _sections; - static std::string MakeKey(std::string section, std::string name); - static int ValueHandler(void* user, const char* section, const char* name, - const char* value); -}; - -#endif // __INIREADER_H__ - - -#ifndef __INIREADER__ -#define __INIREADER__ - -#include -#include -#include - -inline INIReader::INIReader(std::string filename) -{ - _error = ini_parse(filename.c_str(), ValueHandler, this); -} - -inline INIReader::INIReader(FILE *file) -{ - _error = ini_parse_file(file, ValueHandler, this); -} - -inline int INIReader::ParseError() const -{ - return _error; -} - -inline INIReader::~INIReader() { } - -inline const std::set& INIReader::Sections() const -{ - return _sections; -} - -inline std::string INIReader::Get(std::string section, std::string name, std::string default_value) const -{ - std::string key = MakeKey(section, name); - return _values.count(key) ? _values.at(key) : default_value; -} - -inline std::string INIReader::Get(std::string section, std::string name) const -{ - std::string key = MakeKey(section, name); - if(_values.count(key)) return _values.at(key); - else - { - printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str()); - exit(-1); - } -} - -inline long INIReader::GetInteger(std::string section, std::string name, long default_value) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - // This parses "1234" (decimal) and also "0x4D2" (hex) - long n = strtol(value, &end, 0); - return end > value ? n : default_value; -} - -inline long INIReader::GetInteger(std::string section, std::string name) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - // This parses "1234" (decimal) and also "0x4D2" (hex) - long n = strtol(value, &end, 0); - if(end <= value) - { - printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str()); - exit(-1); - } - return n; -} - -inline double INIReader::GetReal(std::string section, std::string name, double default_value) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - double n = strtod(value, &end); - return end > value ? n : default_value; -} - -inline float INIReader::GetFloat(std::string section, std::string name, float default_value) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - float n = strtof(value, &end); - return end > value ? n : default_value; -} - -inline float INIReader::GetFloat(std::string section, std::string name) const -{ - std::string valstr = Get(section, name, ""); - const char* value = valstr.c_str(); - char* end; - float n = strtof(value, &end); - if(end <= value) - { - printf("[ERROR] Does not find the section %s with name %s. 
\n", section.c_str(), name.c_str()); - exit(-1); - } - return n; -} - -inline bool INIReader::GetBoolean(std::string section, std::string name, bool default_value) const -{ - std::string valstr = Get(section, name, ""); - // Convert to lower case to make string comparisons case-insensitive - std::transform(valstr.begin(), valstr.end(), valstr.begin(), ::tolower); - if (valstr == "true" || valstr == "yes" || valstr == "on" || valstr == "1") - return true; - else if (valstr == "false" || valstr == "no" || valstr == "off" || valstr == "0") - return false; - else - return default_value; -} - -inline std::string INIReader::MakeKey(std::string section, std::string name) -{ - std::string key = section + "=" + name; - // Convert to lower case to make section/name lookups case-insensitive - std::transform(key.begin(), key.end(), key.begin(), ::tolower); - return key; -} - -inline int INIReader::ValueHandler(void* user, const char* section, const char* name, - const char* value) -{ - INIReader* reader = (INIReader*)user; - std::string key = MakeKey(section, name); - if (reader->_values[key].size() > 0) - reader->_values[key] += "\n"; - reader->_values[key] += value; - reader->_sections.insert(section); - return 1; -} - -#endif // __INIREADER__ diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d5abdad22..4e996d1855 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,15 @@ if (BUILD_TEST) set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include) endif() +FetchContent_Declare( + yaml-cpp + GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git + GIT_TAG 0.8.0 +) +set(YAML_BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library of yaml-cpp") +FetchContent_MakeAvailable(yaml-cpp) + + option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) option(BUILD_FAST_MATH "Build in fast math mode" ON) diff --git a/README.md b/README.md index a0db34e369..e26d120a71 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) | -[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started/get_started.html) | [🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) English | [简体中文](README_zh-CN.md) | [日本語](README_ja.md) @@ -180,7 +180,7 @@ pip install lmdeploy ``` The default prebuilt package is compiled on **CUDA 12** since v0.3.0. -For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](./docs/en/installation.md). +For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](docs/en/get_started/installation.md). ## Offline Batch Inference @@ -200,7 +200,7 @@ For more information about inference pipeline, please refer to [here](docs/en/ll # Tutorials -Please review [getting_started](./docs/en/get_started.md) section for the basic usage of LMDeploy. +Please review [getting_started](docs/en/get_started/get_started.md) section for the basic usage of LMDeploy. 
For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/): diff --git a/README_ja.md b/README_ja.md index 94e3eb7b6c..9313397435 100644 --- a/README_ja.md +++ b/README_ja.md @@ -8,7 +8,7 @@ [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) | -[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started/get_started.html) | [🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) [English](README.md) | [简体中文](README_zh-CN.md) | 日本語 @@ -181,7 +181,7 @@ pip install lmdeploy ``` v0.3.0から、デフォルトの事前構築済みパッケージはCUDA 12でコンパイルされています。 -CUDA 11+プラットフォームでのインストールに関する情報、またはソースからのビルド手順については、[インストールガイドを](docs/en/installation.md)参照してください。 +CUDA 11+プラットフォームでのインストールに関する情報、またはソースからのビルド手順については、[インストールガイドを](docs/en/get_started/installation.md)参照してください。 ## オフラインバッチ推論 @@ -201,7 +201,7 @@ print(response) # チュートリアル -LMDeployの基本的な使用方法については、[getting_started](./docs/en/get_started.md)セクションを参照してください。 +LMDeployの基本的な使用方法については、[getting_started](docs/en/get_started/get_started.md)セクションを参照してください。 詳細なユーザーガイドと高度なガイドについては、[チュートリアル](https://lmdeploy.readthedocs.io/en/latest/)を参照してください: diff --git a/README_zh-CN.md b/README_zh-CN.md index 79d551e3e3..7332241676 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -8,7 +8,7 @@ [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [📘Documentation](https://lmdeploy.readthedocs.io/zh-cn/latest/) | -[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html) | +[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started/get_started.html) | [🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose) [English](README.md) | 简体中文 | [日本語](README_ja.md) @@ -180,7 +180,7 @@ conda activate lmdeploy pip install lmdeploy ``` -自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,或者源码安装 LMDeploy,请参考[安装文档](./docs/zh_cn/installation.md) +自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,或者源码安装 LMDeploy,请参考[安装文档](docs/zh_cn/get_started/installation.md) ## 离线批处理 @@ -200,7 +200,7 @@ print(response) # 用户教程 -请阅读[快速上手](./docs/zh_cn/get_started.md)章节,了解 LMDeploy 的基本用法。 +请阅读[快速上手](docs/zh_cn/get_started/get_started.md)章节,了解 LMDeploy 的基本用法。 为了帮助用户更进一步了解 LMDeploy,我们准备了用户指南和进阶指南,请阅读我们的[文档](https://lmdeploy.readthedocs.io/zh-cn/latest/): diff --git a/autotest/benchmark/test_apiserver_performance.py b/autotest/benchmark/test_apiserver_performance.py index 5ac0335660..761cf0302b 100644 --- a/autotest/benchmark/test_apiserver_performance.py +++ b/autotest/benchmark/test_apiserver_performance.py @@ -88,13 +88,13 @@ def test_restful_tp4(config, run_id, prepare_environment, worker_id): @pytest.mark.function @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('prepare_environment', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'backend': 'pytorch', 'tp_num': 2, 'extra': '--max-batch-size 256 --cache-max-entry-count 0.9', 'cuda_prefix': None }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': 2, @@ -106,7 +106,8 @@ def test_restful_func_tp2(config, run_id, prepare_environment, worker_id): result, restful_log, msg = 
restful_test(config, run_id, prepare_environment, - worker_id=worker_id) + worker_id=worker_id, + is_smoke=True) if restful_log is not None: allure.attach.file(restful_log, diff --git a/autotest/benchmark/test_generation_performance.py b/autotest/benchmark/test_generation_performance.py index 62a9d53baf..cffdc53270 100644 --- a/autotest/benchmark/test_generation_performance.py +++ b/autotest/benchmark/test_generation_performance.py @@ -117,11 +117,11 @@ def test_generation_longtext_tp4(config, run_id, run_config, worker_id): @pytest.mark.function @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'backend': 'pytorch', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': 2 diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index fe2422c3f9..ad44b22b43 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -62,11 +62,11 @@ def test_throughput_tp4(config, run_id, run_config, worker_id): @pytest.mark.function @pytest.mark.flaky(reruns=0) @pytest.mark.parametrize('run_config', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'backend': 'pytorch', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'backend': 'turbomind', 'quant_policy': 0, 'tp_num': 2 @@ -77,7 +77,8 @@ def test_throughput_func_tp2(config, run_id, run_config, worker_id): run_id, run_config, cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2), - worker_id=worker_id) + worker_id=worker_id, + is_smoke=True) if throughput_log is not None: allure.attach.file(throughput_log, diff --git a/autotest/config.yaml b/autotest/config.yaml index 71786ffdbb..b7c928909a 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -1,7 +1,7 @@ model_path: /nvme/qa_test_models dst_path: /nvme/qa_test_models/autotest_model log_path: /nvme/qa_test_models/autotest_model/log -benchmark_path: /nvme/qa_test_models/benchmark_reports +benchmark_path: /nvme/qa_test_models/benchmark-reports dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json tp_config: @@ -16,7 +16,6 @@ tp_config: Meta-Llama-3-1-70B-Instruct: 4 internlm2_5-7b-chat-1m: 4 Qwen2-7B-Instruct-GPTQ-Int4: 2 - InternVL2-40B: 2 turbomind_chat_model: @@ -45,6 +44,7 @@ turbomind_chat_model: - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ - Qwen/Qwen-VL-Chat + - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mistral-7B-Instruct-v0.3 @@ -104,6 +104,7 @@ turbomind_base_model: pytorch_base_model: - tiiuae/falcon-7b + - internlm/internlm2_5-7b - internlm/internlm2_5-1_8b - internlm/internlm2-20b @@ -160,8 +161,9 @@ turbomind_quatization: - baichuan-inc/Baichuan2-7B-Chat - codellama/CodeLlama-7b-hf - openbmb/MiniCPM-Llama3-V-2_5 + - THUDM/glm-4-9b-chat gptq: - - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 + - internlm/internlm2_5-7b-chat kvint: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct @@ -205,9 +207,11 @@ pytorch_quatization: - internlm/internlm2_5-20b-chat - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b + - OpenGVLab/InternVL-Chat-V1-5 - 01-ai/Yi-6B-Chat - Qwen/Qwen2-7B-Instruct - 
Qwen/Qwen2-1.5B-Instruct + - microsoft/Phi-3-mini-4k-instruct w8a8: - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-2-7b-chat-hf diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index a90e7775f5..9ee793a895 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -17,7 +17,7 @@ TurbomindEngineConfig, pipeline) -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt(config, model, backend, worker_id): @@ -47,7 +47,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt_stream(config, model, backend, worker_id): @@ -79,7 +79,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt(config, model, backend, worker_id): @@ -109,7 +109,7 @@ def run_pipeline_testcase_with_prompt(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt_stream(config, model, backend, worker_id): @@ -141,7 +141,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message(config, model, backend, worker_id): @@ -172,7 +172,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_stream(config, model, backend, worker_id): @@ -204,7 +204,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_batch(config, model, backend, worker_id): @@ -241,7 +241,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, 
PytorchEngineConfig]) def test_return_with_message_batch_stream(config, model, backend, worker_id): @@ -279,7 +279,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs(config, model, backend, worker_id): @@ -309,7 +309,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs_stream(config, model, backend, worker_id): @@ -343,7 +343,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_session_len(config, model, backend, worker_id): @@ -377,7 +377,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_min_new_tokens(config, model, backend, worker_id): @@ -413,7 +413,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_stop_words(config, model, backend, worker_id): @@ -435,7 +435,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): result &= ' and' not in response[ i].text and ' to ' not in response[i].text result &= response[i].finish_reason == 'stop' and response[ - i].generate_token_len < 20 + i].generate_token_len < 50 save_pipeline_common_log(config, file_name, result, response) del pipe torch.cuda.empty_cache() @@ -454,7 +454,7 @@ def run_pipeline_testcase_stop_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_bad_words(config, model, backend, worker_id): @@ -492,7 +492,7 @@ def run_pipeline_testcase_bad_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_false(config, model, backend, worker_id): @@ -533,7 +533,7 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) 
+@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_true(config, model, backend, worker_id): @@ -574,7 +574,7 @@ def run_pipeline_testcase_special_words(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimum_repetition_penalty(config, model, backend, @@ -590,8 +590,9 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, gen_config = GenerationConfig(repetition_penalty=0.01, random_seed=1) response = pipe('Shanghai is', gen_config=gen_config) - result = 'a 上海 is a 上海, ' * 10 in response.text or get_repeat_times( - response.text, 'Shanghai is') > 5 + result = get_repeat_times(response.text, + 'is a name') > 5 or get_repeat_times( + response.text, 'Shanghai is') > 5 save_pipeline_common_log(config, file_name, result, response) del pipe torch.cuda.empty_cache() @@ -610,7 +611,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_repetition_penalty_bigger_than_1(config, model, backend, @@ -644,7 +645,7 @@ def run_pipeline_testcase_repetition_penalty(config, model, backend, del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topp(config, model, backend, worker_id): @@ -676,7 +677,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topk(config, model, backend, worker_id): @@ -711,7 +712,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_diff_random_seed(config, model, backend, worker_id): @@ -747,7 +748,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_same_random_seed(config, model, backend, worker_id): @@ -781,7 +782,7 @@ def run_pipeline_testcase(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) 
@pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_max_new_tokens(config, model, backend, worker_id): @@ -819,7 +820,7 @@ def run_pipeline_testcase_max_new_tokens(config, model, backend, del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_ignore_eos(config, model, backend, worker_id): @@ -856,7 +857,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_input_validation(config, model, backend, worker_id): @@ -896,7 +897,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_validate_turbomind(config, model, backend, worker_id): if 'gw' in worker_id: @@ -936,7 +937,7 @@ def test_backend_config_validate_turbomind(config, model, backend, worker_id): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [PytorchEngineConfig]) def test_backend_config_validate_pytorch(config, model, backend, worker_id): if 'gw' in worker_id: @@ -967,7 +968,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): del os.environ['CUDA_VISIBLE_DEVICES'] -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_tp(config, model, backend, worker_id): with pytest.raises(AssertionError, match='tp should be 2\\^n'): diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index 8e1f183905..88b8a2847e 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -19,7 +19,7 @@ @pytest.mark.gpu_num_1 @pytest.mark.parametrize('model', [ 'internlm/internlm2-chat-7b', 'internlm/internlm2_5-7b', - 'internlm/internlm2-chat-1_8b', 'internlm/internlm2-1_8b' + 'internlm/internlm2-chat-1_8b' ]) def test_history_issue_tp1(config, model, worker_id): log_name = ''.join(['pipeline_longtext_issue_', worker_id, '.log']) @@ -147,7 +147,6 @@ def passkey_retrival(config, tp=tp_num) else: backend_config = TurbomindEngineConfig(session_len=session_len, - use_logn_attn=True, tp=tp_num) else: if 'internlm2_5' in model and '-1m' in model: diff --git a/autotest/interface/restful/test_restful_completions_v1.py b/autotest/interface/restful/test_restful_completions_v1.py index 3e2ca31664..83e52d83c9 100644 --- a/autotest/interface/restful/test_restful_completions_v1.py +++ b/autotest/interface/restful/test_restful_completions_v1.py @@ -188,3 +188,14 @@ def test_completions_stream_stopwords(self): 
assert output_last.get('choices')[0].get('finish_reason') in [ 'stop', 'length' ] + + def test_batch_prompt_order(self): + api_client = APIClient(BASE_URL) + model_name = api_client.available_models[0] + for item in api_client.completions_v1( + model=model_name, + prompt=['你好', '今天天气怎么样', '你是谁', '帮我写一首以梅花为主题的五言律诗', '5+2等于多少'], + max_tokens=200): + assert '天气' in item.get('choices')[1].get('text') + assert '梅' in item.get('choices')[3].get('text') + assert '7' in item.get('choices')[4].get('text') diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 3ee8608604..642f87ec28 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -56,7 +56,7 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id): @pytest.mark.hf_pytorch_chat @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) def test_hf_pytorch_chat_pr(config, model, cli_case_config): usercase = 'chat_testcase' result, chat_log, msg = hf_command_line_test( diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 5f5e1fde59..2f13898fec 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -106,9 +106,10 @@ def test_hf_turbomind_base_tp2(config, model, cli_case_config, worker_id): @pytest.mark.hf_turbomind_chat @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_hf_turbomind_chat_pr(config, model, cli_case_config): usercase = 'chat_testcase' diff --git a/autotest/tools/chat/test_command_chat_workspace.py b/autotest/tools/chat/test_command_chat_workspace.py index ee7b2ddc47..a16d4e32f6 100644 --- a/autotest/tools/chat/test_command_chat_workspace.py +++ b/autotest/tools/chat/test_command_chat_workspace.py @@ -97,9 +97,10 @@ def test_workspace_base_tp2(config, cli_case_config, model, worker_id): @pytest.mark.command_chat @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_workspace_chat_pr(config, cli_case_config, model): usercase = 'chat_testcase' result, chat_log, msg = command_line_test( diff --git a/autotest/tools/convert/test_convert.py b/autotest/tools/convert/test_convert.py index a8f0859275..9d194f1ea3 100644 --- a/autotest/tools/convert/test_convert.py +++ b/autotest/tools/convert/test_convert.py @@ -22,9 +22,10 @@ def test_convert(config, model, worker_id): @pytest.mark.convert @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_convert_pr(config, model): convert(config, model, 'CUDA_VISIBLE_DEVICES=5') diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py 
b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py index 778ff73c7e..8f56225ebc 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch.py @@ -63,7 +63,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model, @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b']) +@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) def test_pipeline_chat_pytorch_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'pytorch')) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py index 6373549698..d92af06ecb 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind.py @@ -100,9 +100,10 @@ def test_pipeline_chat_kvint_tp2(config, common_case_config, model, @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_2 @pytest.mark.pr_test -@pytest.mark.parametrize( - 'model', - ['internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-inner-4bits']) +@pytest.mark.parametrize('model', [ + 'internlm/internlm2_5-20b-chat', + 'internlm/internlm2_5-20b-chat-inner-4bits' +]) def test_pipeline_chat_pr(config, common_case_config, model): p = Process(target=run_pipeline_chat_test, args=(config, common_case_config, model, 'turbomind')) diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index a4c6b26043..aaaabbc6f3 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -11,7 +11,17 @@ @pytest.mark.timeout(900) @pytest.mark.parametrize('model', get_quantization_model_list('awq')) def test_quantization_awq(config, model, worker_id): - quantization_awq(config, model + '-inner-4bits', model, + quantization_type = 'awq' + quantization_all(config, model + '-inner-4bits', model, quantization_type, + get_cuda_prefix_by_workerid(worker_id)) + + +@pytest.mark.order(3) +@pytest.mark.timeout(900) +@pytest.mark.parametrize('model', get_quantization_model_list('gptq')) +def test_quantization_gptq(config, model, worker_id): + quantization_type = 'gptq' + quantization_all(config, model + '-inner-gptq', model, quantization_type, get_cuda_prefix_by_workerid(worker_id)) @@ -22,14 +32,15 @@ def test_quantization_awq(config, model, worker_id): @pytest.mark.timeout(900) @pytest.mark.parametrize( 'model, prefix', - [('internlm/internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')]) + [('internlm/internlm2_5-20b-chat', 'CUDA_VISIBLE_DEVICES=5')]) def test_quantization_awq_pr(config, model, prefix): - quantization_awq(config, model + '-inner-4bits', model, prefix) + quantization_type = 'awq' + quantization_all(config, model + '-inner-4bits', model, quantization_type, + prefix) -def quantization_awq(config, quantization_model_name, origin_model_name, - cuda_prefix): - quantization_type = 'awq' +def quantization_all(config, quantization_model_name, origin_model_name, + quantization_type, cuda_prefix): result, msg = quantization(config, quantization_model_name, origin_model_name, quantization_type, cuda_prefix) diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind.py b/autotest/tools/restful/test_restful_chat_hf_turbomind.py index 4046cf38c2..c9fade16a4 100644 --- 
a/autotest/tools/restful/test_restful_chat_hf_turbomind.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind.py @@ -117,11 +117,11 @@ def test_restful_chat_kvint_tp2(config, common_case_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }], diff --git a/autotest/tools/restful/test_restful_chat_workspace.py b/autotest/tools/restful/test_restful_chat_workspace.py index 17205a9d95..798a43d7b0 100644 --- a/autotest/tools/restful/test_restful_chat_workspace.py +++ b/autotest/tools/restful/test_restful_chat_workspace.py @@ -69,11 +69,11 @@ def test_restful_chat_tp2(config, common_case_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.pr_test @pytest.mark.parametrize('prepare_environment', [{ - 'model': 'internlm/internlm2-chat-20b', + 'model': 'internlm/internlm2_5-20b-chat', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }, { - 'model': 'internlm/internlm2-chat-20b-inner-4bits', + 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', 'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6', 'tp_num': 2 }], diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py index 12aa260c37..9356f40a3b 100644 --- a/autotest/utils/benchmark_utils.py +++ b/autotest/utils/benchmark_utils.py @@ -165,7 +165,7 @@ def restful_test(config, command = f'python3 benchmark/profile_restful_api.py localhost:{port} {model_path} {dataset_path} --stream-output True ' # noqa: F401, E501 if is_smoke: - command += ' --num-prompts 300' + command += ' --num-prompts 200' else: command += ' --num-prompts 2000' diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 539280c7ee..1cc556748e 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -13,8 +13,9 @@ def get_turbomind_model_list(tp_num: int = None, for key in quatization_case_config.get('awq'): if key in case_list: case_list.append(key + '-inner-4bits') - if model_type == 'chat_model': - case_list += quatization_case_config.get('gptq') + for key in quatization_case_config.get('gptq'): + if key in case_list: + case_list.append(key + '-inner-gptq') if tp_num is not None: return [ @@ -54,13 +55,11 @@ def get_all_model_list(tp_num: int = None, model_type: str = 'chat_model'): turbomind_quantization_config = config.get('turbomind_quatization') pytorch_quantization_config = config.get('pytorch_quatization') for key in turbomind_quantization_config.get( - 'awq') + pytorch_quantization_config.get('awq'): + 'awq') + pytorch_quantization_config.get( + 'awq') + turbomind_quantization_config.get('gptq'): if key in case_list and key + '-inner-4bits' not in case_list: case_list.append(key + '-inner-4bits') - if model_type == 'chat_model': - case_list += turbomind_quantization_config.get('gptq') - if tp_num is not None: return [ item for item in case_list if get_tp_num(config, item) == tp_num @@ -85,6 +84,9 @@ def get_kvint_model_list(tp_num: int = None, model_type: str = 'chat_model'): for key in config.get('turbomind_quatization').get('awq'): if key in case_list_base and key in case_list: case_list.append(key + '-inner-4bits') + for key in config.get('turbomind_quatization').get('gptq'): + if key in case_list_base and key in 
case_list: + case_list.append(key + '-inner-gptq') if tp_num is not None: return [ @@ -104,6 +106,8 @@ def get_quantization_model_list(type): return case_list if type == 'kvint': return config.get('turbomind_quatization').get(type) + if type == 'gptq': + return config.get('turbomind_quatization').get(type) if type == 'w8a8': return config.get('pytorch_quatization').get(type) return [] diff --git a/autotest/utils/quantization_utils.py b/autotest/utils/quantization_utils.py index 153a854bb7..75b7319aeb 100644 --- a/autotest/utils/quantization_utils.py +++ b/autotest/utils/quantization_utils.py @@ -23,13 +23,18 @@ def quantization(config, cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path, '--work-dir', quantization_model_path, '--batch-size 32' ]) + elif quantization_type == 'gptq': + quantization_cmd = ' '.join([ + cuda_prefix, 'lmdeploy lite auto_gptq', origin_model_path, + '--work-dir', quantization_model_path, '--batch-size 32' + ]) elif quantization_type == 'w8a8': quantization_cmd = ' '.join([ cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path, '--work-dir', quantization_model_path, '--batch-size 32' ]) else: - return False, 'quantization type should in [awq, w8a8], \ + return False, 'quantization type should in [awq, gptq, w8a8], \ now the type is ' + quantization_type if 'llama-3' in origin_model_name.lower(): diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py index b6dd6a5f48..edc2268e30 100644 --- a/autotest/utils/run_client_chat.py +++ b/autotest/utils/run_client_chat.py @@ -104,7 +104,7 @@ def command_test(config, file.writelines('reproduce command chat: ' + ' '.join(cmd) + '\n') spliter = '\n\n' - if 'codellama' in model.lower() and ' chat ' in cmd: + if 'codellama' in model.lower() and 'serve' not in ' '.join(cmd): spliter = '\n!!\n' # join prompt together prompt = '' diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index 81de3dbf45..89c07fb196 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -16,7 +16,7 @@ from tqdm import tqdm from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) from lmdeploy.utils import get_logger @@ -25,7 +25,7 @@ def infer(model, session_id: int, input_ids: List, - gen_config: EngineGenerationConfig, test_round: int, que: Queue): + gen_config: GenerationConfig, test_round: int, que: Queue): if session_id == 1: pbar = tqdm(total=test_round) chatbot = model.create_instance() @@ -73,7 +73,7 @@ def infer(model, session_id: int, input_ids: List, def warmup(model, concurrency: int, input_ids: List[int], warmup_round: int, - gen_config: EngineGenerationConfig): + gen_config: GenerationConfig): if not warmup_round: return @@ -110,7 +110,7 @@ def _infer(model, session_id): def profile_throughput(model_path: str, concurrency: int, input_seqlen: int, engine_config: Union[PytorchEngineConfig, TurbomindEngineConfig], - gen_config: EngineGenerationConfig, test_round: int, + gen_config: GenerationConfig, test_round: int, warmup_round: int): output_seqlen = gen_config.max_new_tokens print(f'profiling ... 
concurrency: {concurrency}, ' @@ -424,12 +424,11 @@ def main(): thread_safe=True, enable_prefix_caching=args.enable_prefix_caching, ) - gen_config = EngineGenerationConfig( - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, - max_new_tokens=completion_tokens, - ignore_eos=True) + gen_config = GenerationConfig(top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + max_new_tokens=completion_tokens, + ignore_eos=True) profile_target = partial( profile_throughput, concurrency=batch, diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index be7c1035e8..23fa317810 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -13,7 +13,7 @@ from tqdm import tqdm from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) from lmdeploy.pytorch.engine import EngineInstance from lmdeploy.tokenizer import DetokenizeState, Tokenizer @@ -105,12 +105,11 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, for outputs in model_inst.stream_infer( session_id, input_ids=input_ids, - gen_config=EngineGenerationConfig( - max_new_tokens=output_seqlen, - temperature=temperature, - top_p=top_p, - top_k=top_k, - ignore_eos=True), + gen_config=GenerationConfig(max_new_tokens=output_seqlen, + temperature=temperature, + top_p=top_p, + top_k=top_k, + ignore_eos=True), sequence_start=True, sequence_end=True, stream_output=stream_output): diff --git a/docker/Dockerfile_aarch64_ascend b/docker/Dockerfile_aarch64_ascend new file mode 100644 index 0000000000..058ec6a905 --- /dev/null +++ b/docker/Dockerfile_aarch64_ascend @@ -0,0 +1,171 @@ +FROM ubuntu:20.04 as export_image + +WORKDIR /tmp + +ARG http_proxy +ARG https_proxy +ARG PYVERSION=3.10.5 +ARG DEBIAN_FRONTEND=noninteractive +ARG CHIP=all +ARG ASCEND_BASE=/usr/local/Ascend +ARG TOOLKIT_PKG=Ascend-cann-toolkit_*.run +ARG KERNELS_PKG=Ascend-cann-kernels-*.run +ARG TOOLKIT_PATH=$ASCEND_BASE/ascend-toolkit/latest +ARG DEEPLINK_TAG_OR_COMMIT=6012186b03cff6eac6587e7a06dbaa590af6d5df +ARG DEEPLINKEXT_TAG_OR_COMMIT=525678f2c4c227e1e8bf358259a19a578b67bc37 +ARG LMDEPLOY_TAG_OR_COMMIT=v0.6.0a0 + +RUN sed -i 's@http://.*.ubuntu.com@http://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list && \ + apt update && \ + apt install --no-install-recommends ca-certificates -y && \ + apt install --no-install-recommends bc wget -y && \ + apt install --no-install-recommends curl gcc make g++ pkg-config unzip -y && \ + apt install --no-install-recommends libsqlite3-dev libblas3 liblapack3 gfortran vim -y && \ + apt install --no-install-recommends liblapack-dev libblas-dev libhdf5-dev libffi-dev -y && \ + apt install --no-install-recommends libssl-dev zlib1g-dev xz-utils cython3 python3-h5py -y && \ + apt install --no-install-recommends libopenblas-dev libgmpxx4ldbl liblzma-dev -y && \ + apt install --no-install-recommends libicu66 libxml2 pciutils libgl1-mesa-glx libbz2-dev -y && \ + apt install --no-install-recommends libreadline-dev libncurses5 libncurses5-dev libncursesw5 -y && \ + apt install --no-install-recommends git gdb gcc-7 g++-7 -y && \ + sed -i 's@http://mirrors.tuna.tsinghua.edu.cn@https://mirrors.tuna.tsinghua.edu.cn@g' /etc/apt/sources.list && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 7 \ + --slave 
/usr/bin/g++ g++ /usr/bin/g++-7 --slave /usr/bin/gcov gcov /usr/bin/gcov-7 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9 \ + --slave /usr/bin/g++ g++ /usr/bin/g++-9 --slave /usr/bin/gcov gcov /usr/bin/gcov-9 && \ + update-alternatives --set gcc $(update-alternatives --list gcc | grep gcc-7) + +ENV LD_LIBRARY_PATH=/usr/local/python${PYVERSION}/lib: \ + PATH=/usr/local/python${PYVERSION}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +RUN umask 0022 && \ + wget https://repo.huaweicloud.com/python/${PYVERSION}/Python-${PYVERSION}.tar.xz && \ + tar -xf Python-${PYVERSION}.tar.xz && cd Python-${PYVERSION} && ./configure --prefix=/usr/local/python${PYVERSION} --enable-shared && \ + make -j 16 && make install && \ + ln -sf /usr/local/python${PYVERSION}/bin/python3 /usr/bin/python3 && \ + ln -sf /usr/local/python${PYVERSION}/bin/python3 /usr/bin/python && \ + ln -sf /usr/local/python${PYVERSION}/bin/pip3 /usr/bin/pip3 && \ + ln -sf /usr/local/python${PYVERSION}/bin/pip3 /usr/bin/pip && \ + cd .. && \ + rm -rf Python* && \ + mkdir -p ~/.pip && \ + echo '[global] \n\ + index-url=http://mirrors.aliyun.com/pypi/simple\n\ + trusted-host=mirrors.aliyun.com' >> ~/.pip/pip.conf && \ + pip3 install pip -U + +RUN pip3 install -U pip && \ + pip3 install wheel==0.43.0 scikit-build==0.18.0 numpy==1.24 setuptools==69.5.1 && \ + pip3 install decorator sympy cffi && \ + pip3 install cmake ninja pyyaml && \ + pip3 install pathlib2 protobuf attrs attr scipy && \ + pip3 install requests psutil absl-py && \ + pip3 install torch==2.1.1 torchvision==0.16.1 --index-url=https://download.pytorch.org/whl/cpu && \ + pip3 install transformers==4.41.0 && \ + rm -rf /root/.cache/pip + +ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$LD_LIBRARY_PATH +ENV LD_PRELOAD=/lib/aarch64-linux-gnu/libGLdispatch.so.0:$LD_PRELOAD + +RUN if [ ! -d "/lib64" ]; \ + then \ + mkdir /lib64 && ln -sf /lib/ld-linux-aarch64.so.1 /lib64/ld-linux-aarch64.so.1; \ + fi + +FROM ubuntu:20.04 as buildtemp +COPY ./*.run /tmp + +FROM export_image + +ENV LD_LIBRARY_PATH=\ +$ASCEND_BASE/driver/lib64:\ +$ASCEND_BASE/driver/lib64/common:\ +$ASCEND_BASE/driver/lib64/driver:\ +$ASCEND_BASE/driver/tools/hccn_tool/:\ +$TOOLKIT_PATH/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64/:\ +$LD_LIBRARY_PATH + +RUN --mount=type=cache,target=/tmp,from=buildtemp,source=/tmp \ + umask 0022 && \ + mkdir -p $ASCEND_BASE/driver && \ + if [ "$CHIP" != "all" ]; \ + then \ + CHIPOPTION="--chip=$CHIP"; \ + else \ + CHIPOPTION=""; \ + fi && \ + chmod +x $TOOLKIT_PKG $KERNELS_PKG && \ + ./$TOOLKIT_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all $CHIPOPTION && \ + ./$KERNELS_PKG --quiet --install --install-path=$ASCEND_BASE --install-for-all && \ + rm -f $TOOLKIT_PKG $KERNELS_PKG + +ENV GLOG_v=2 \ + LD_LIBRARY_PATH=$TOOLKIT_PATH/lib64:$LD_LIBRARY_PATH \ + TBE_IMPL_PATH=$TOOLKIT_PATH/opp/op_impl/built-in/ai_core/tbe \ + PATH=$TOOLKIT_PATH/ccec_compiler/bin:$PATH \ + ASCEND_OPP_PATH=$TOOLKIT_PATH/opp \ + ASCEND_AICPU_PATH=$TOOLKIT_PATH + +ENV PYTHONPATH=$TBE_IMPL_PATH:$PYTHONPATH + +RUN rm -rf ./* + +SHELL ["/bin/bash", "-c"] +RUN echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc && \ + . 
~/.bashrc + +WORKDIR /deeplink +RUN echo -e "diff --git a/impl/ascend_npu/CMakeLists.txt b/impl/ascend_npu/CMakeLists.txt\n\ +index e684c59..f1cd8d4 100755\n\ +--- a/impl/ascend_npu/CMakeLists.txt\n\ ++++ b/impl/ascend_npu/CMakeLists.txt\n\ +@@ -14,6 +14,11 @@ FetchContent_Declare(op_plugin\n\ + FetchContent_MakeAvailable(op_plugin)\n\ + message(STATUS \"op-plugin download done\")\n\ + \n\ ++add_custom_target(patch_op_plugin_code\n\ ++ COMMAND sed -i 's/GetOpApiLibHandler\(GetCustOpApiLibName\(\)\)/nullptr/' \${op_plugin_SOURCE_DIR}/op_plugin/utils/op_api_common.h\n\ ++ BYPRODUCTS \${op_plugin_SOURCE_DIR}/op_plugin/utils/op_api_common.h\n\ ++)\n\ ++\n\ + add_custom_target(op_plugin_gen\n\ + COMMAND cd \${op_plugin_SOURCE_DIR} && bash ./gencode.sh 2.1 python\n\ + BYPRODUCTS \${op_plugin_SOURCE_DIR}/op_plugin/OpInterface.h \${op_plugin_SOURCE_DIR}/op_plugin/OpInterface.cpp\n\ +@@ -253,7 +258,7 @@ endif()\n\ + set(THIRD_PARTY_INCLUDE_DIRS \${CMAKE_CURRENT_SOURCE_DIR}/../third_party/half/include)\n\ + \n\ + add_library(\${DEVICEIMPL} SHARED \${IMPL_SRC})\n\ +-add_dependencies(\${DEVICEIMPL} op_plugin_gen)\n\ ++add_dependencies(\${DEVICEIMPL} op_plugin_gen patch_op_plugin_code)\n\ + set_target_properties(\${DEVICEIMPL} PROPERTIES SUFFIX \".so\")\n\ + target_include_directories(\${DEVICEIMPL} PRIVATE \${ASCEND_DIR}/ascend-toolkit/latest/include/aclnn)\n\ + target_include_directories(\${DEVICEIMPL} SYSTEM PUBLIC \${THIRD_PARTY_INCLUDE_DIRS})\n" > /deeplink/warning.patch + +# deeplink +RUN git clone https://github.com/DeepLink-org/deeplink.framework.git && \ + cd deeplink.framework/dipu && \ + git checkout ${DEEPLINK_TAG_OR_COMMIT} && \ + git submodule update --init --recursive && \ + git -C ./third_party/DIOPI apply /deeplink/warning.patch && \ + DIPU_DEVICE=ascend python setup.py develop && \ + rm -rf /root/.cache/pip + +# deeplink_ext +RUN git clone https://github.com/DeepLink-org/DeepLinkExt.git && \ + cd DeepLinkExt && \ + git checkout ${DEEPLINKEXT_TAG_OR_COMMIT} && \ + DIPU_REPO=/deeplink/deeplink.framework/dipu DIPU_ROOT=${DIPU_REPO}/torch_dipu \ + DIOPI_PATH=${DIPU_REPO}/third_party/DIOPI/proto \ + VENDOR_INCLUDE_DIRS=/usr/local/Ascend/ascend-toolkit/latest/include \ + pip install -vv --no-build-isolation -e . && \ + rm -rf /root/.cache/pip + +# lmdeploy +WORKDIR /workspace +RUN git clone https://github.com/InternLM/lmdeploy.git && \ + cd lmdeploy && \ + git checkout ${LMDEPLOY_TAG_OR_COMMIT} && \ + sed -i '/triton/d' requirements/runtime.txt && \ + pip install -vv --no-build-isolation -e . && \ + rm -rf /root/.cache/pip diff --git a/docs/en/advance/debug_turbomind.md b/docs/en/advance/debug_turbomind.md index c4c7b32f7f..91733ce2a5 100644 --- a/docs/en/advance/debug_turbomind.md +++ b/docs/en/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind is implemented in C++, which is not as easy to debug as Python. This d ## Prerequisite -First, complete the local compilation according to the commands in [Install from source](../installation.md). +First, complete the local compilation according to the commands in [Install from source](../get_started/installation.md). ## Configure Python debug environment diff --git a/docs/en/advance/structed_output.md b/docs/en/advance/structed_output.md new file mode 100644 index 0000000000..b4b3cd7dfd --- /dev/null +++ b/docs/en/advance/structed_output.md @@ -0,0 +1,106 @@ +# Structured output + +Currently, only the Pytorch backend has this capability. 
Therefore, whether you are using the pipeline or the api_server, please specify the use of the Pytorch backend. + +## pipeline + +```python +from lmdeploy import pipeline +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig + +model = 'internlm/internlm2-chat-1_8b' +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': ['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +pipe = pipeline(model, backend_config=PytorchEngineConfig(), log_level='INFO') +gen_config = GenerationConfig( + response_format=dict(type='json_schema', json_schema=dict(name='test', schema=guide))) +response = pipe(['Make a self introduction please.'], gen_config=gen_config) +print(response) +``` + +## api_server + +Firstly, start the api_server service for the InternLM2 model. + +```shell +lmdeploy serve api_server internlm/internlm2-chat-1_8b --backend pytorch +``` + +The client can test using OpenAI’s python package: The output result is a response in JSON format. + +```python +from openai import OpenAI +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': ['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +response_format=dict(type='json_schema', json_schema=dict(name='test',schema=guide)) +messages = [{'role': 'user', 'content': 'Make a self-introduction please.'}] +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + response_format=response_format, + top_p=0.8) +print(response) +``` diff --git a/docs/en/benchmark/evaluate_with_opencompass.md b/docs/en/benchmark/evaluate_with_opencompass.md index 574b9ed506..04468bea2f 100644 --- a/docs/en/benchmark/evaluate_with_opencompass.md +++ b/docs/en/benchmark/evaluate_with_opencompass.md @@ -8,7 +8,7 @@ In this part, we are going to setup the environment for evaluation. ### Install lmdeploy -Please follow the [installation guide](../installation.md) to install lmdeploy. +Please follow the [installation guide](../get_started/installation.md) to install lmdeploy. ### Install OpenCompass diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md new file mode 100644 index 0000000000..eeb1371ea0 --- /dev/null +++ b/docs/en/get_started/ascend/get_started.md @@ -0,0 +1,117 @@ +# Get Started with Huawei Ascend (Atlas 800T A2) + +The usage of lmdeploy on a Huawei Ascend device is almost the same as its usage on CUDA with PytorchEngine in lmdeploy. +Please read the original [Get Started](../get_started.md) guide before reading this tutorial. 
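In practice, the only change relative to running the PyTorch engine on CUDA is the `device_type` field of `PytorchEngineConfig`. The snippet below is a minimal sketch of that difference, assuming the `internlm/internlm2_5-7b-chat` model used elsewhere in this guide and an environment prepared as described in the Installation section.

```python
from lmdeploy import pipeline, PytorchEngineConfig

# On CUDA the PyTorch engine would be selected with:
#   backend_config = PytorchEngineConfig(tp=1)
# On Huawei Ascend, only device_type changes:
pipe = pipeline('internlm/internlm2_5-7b-chat',
                backend_config=PytorchEngineConfig(tp=1, device_type='ascend'))
response = pipe(['Please introduce China'])
print(response)
```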
+ +## Installation + +### Environment Preparation + +#### Drivers and Firmware + +The host machine needs to install the Huawei driver and firmware version 23.0.3, refer to +[CANN Driver and Firmware Installation](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +and [download resources](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC3.alpha001&driver=1.0.0.2.alpha). + +#### CANN + +File `docker/Dockerfile_aarch64_ascend` does not provide Ascend CANN installation package, users need to download the CANN (version 8.0.RC3.alpha001) software packages from [Ascend Resource Download Center](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.alpha001) themselves. And place the Ascend-cann-kernels-910b\*.run and Ascend-cann-toolkit\*-aarch64.run under the directory where the docker build command is executed. + +#### Docker + +Building the aarch64_ascend image requires Docker >= 18.03 + +#### Reference Command for Building the Image + +The following reference command for building the image is based on the lmdeploy source code root directory as the current directory, and the CANN-related installation packages are also placed under this directory. + +```bash +DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:v0.1 \ +    -f docker/Dockerfile_aarch64_ascend . +``` + +This image will install lmdeploy to `/workspace/lmdeploy` directory using `pip install --no-build-isolation -e .` command. + +#### Using the Image + +You can refer to the [documentation](https://www.hiascend.com/document/detail/zh/mindx-dl/60rc1/clusterscheduling/dockerruntimeug/dlruntime_ug_013.html) +for usage. It is recommended to install Ascend Docker Runtime. +Here is an example of starting container for Huawei Ascend device with Ascend Docker Runtime installed: + +```bash +docker run -e ASCEND_VISIBLE_DEVICES=0 --net host -td --entry-point bash --name lmdeploy_ascend_demo \ +    lmdeploy-aarch64-ascend:v0.1  # docker_image_sha_or_name +``` + +#### Pip install + +If you have lmdeploy installed and all Huawei environments are ready, you can run the following command to enable lmdeploy to run on Huawei Ascend devices. (Not necessary if you use the Docker image.) + +```bash +pip install dlinfer-ascend +``` + +## Offline batch inference + +### LLM inference + +Set `device_type="ascend"`  in the `PytorchEngineConfig`: + +```python +from lmdeploy import pipeline +from lmdeploy import PytorchEngineConfig +if __name__ == "__main__": +    pipe = pipeline("internlm/internlm2_5-7b-chat", +     backend_config = PytorchEngineConfig(tp=1, device_type="ascend")) +    question = ["Shanghai is", "Please introduce China", "How are you?"] +    response = pipe(question) +    print(response) +``` + +### VLM inference + +Set `device_type="ascend"` in the `PytorchEngineConfig`: + +```python +from lmdeploy import pipeline, PytorchEngineConfig +from lmdeploy.vl import load_image +if __name__ == "__main__": + pipe = pipeline('OpenGVLab/InternVL2-2B', +     backend_config=PytorchEngineConfig(tp=1, device_type='ascend')) +    image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +    response = pipe(('describe this image', image)) +    print(response) +``` + +## Online serving + +### Serve a LLM model + +Add `--device ascend` in the serve command. 
+ +```bash +lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5-7b-chat +``` + +### Serve a VLM model + +Add `--device ascend` in the serve command + +```bash +lmdeploy serve api_server --backend pytorch --device ascend OpenGVLab/InternVL2-2B +``` + +## Inference with Command line Interface + +Add `--device ascend` in the serve command. + +```bash +lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend +``` + +Run the following commands to launch lmdeploy chatting after starting container: + +```bash +docker exec -it lmdeploy_ascend_demo \ +    bash -i -c "lmdeploy chat --backend pytorch --device ascend internlm/internlm2_5-7b-chat" +``` diff --git a/docs/en/get_started.md b/docs/en/get_started/get_started.md similarity index 94% rename from docs/en/get_started.md rename to docs/en/get_started/get_started.md index 311980536f..8650858d12 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started/get_started.md @@ -1,6 +1,6 @@ # Quick Start -This tutorial shows the usage of LMDeploy on: +This tutorial shows the usage of LMDeploy on CUDA platform: - Offline inference of LLM model and VLM model - Serve a LLM or VLM model by the OpenAI compatible server @@ -19,7 +19,7 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is']) print(response) ``` -When constructing the `pipeline`, if an inference engine is not designated between the TurboMind Engine and the PyTorch Engine, LMDeploy will automatically assign one based on [their respective capabilities](supported_models/supported_models.md), with the TurboMind Engine taking precedence by default. +When constructing the `pipeline`, if an inference engine is not designated between the TurboMind Engine and the PyTorch Engine, LMDeploy will automatically assign one based on [their respective capabilities](../supported_models/supported_models.md), with the TurboMind Engine taking precedence by default. However, you have the option to manually select an engine. For instance, @@ -74,7 +74,7 @@ response = pipe(prompts, In the `GenerationConfig`, `top_k=1` or `temperature=0.0` indicates greedy search. -For more information about pipeline, please read the [detailed tutorial](llm/pipeline.md) +For more information about pipeline, please read the [detailed tutorial](../llm/pipeline.md) ### VLM inference @@ -110,7 +110,7 @@ print(response) However, the larger the image batch size, the greater risk of an OOM error, because the LLM component within the VLM model pre-allocates a massive amount of memory in advance. -We encourage you to manually choose between the TurboMind Engine and the PyTorch Engine based on their respective capabilities, as detailed in [the supported-models matrix](./supported_models/supported_models.md). +We encourage you to manually choose between the TurboMind Engine and the PyTorch Engine based on their respective capabilities, as detailed in [the supported-models matrix](../supported_models/supported_models.md). 
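As a rough sketch of what that manual choice can look like for a VLM pipeline, the snippet below pins the TurboMind engine and lowers its k/v-cache budget; the model name and the `cache_max_entry_count` value here are illustrative assumptions, not recommended settings.

```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# Explicitly pick the TurboMind engine and shrink the k/v cache ratio
# (default 0.8) to reduce the risk of OOM with large image inputs.
pipe = pipeline('OpenGVLab/InternVL2-2B',
                backend_config=TurbomindEngineConfig(session_len=8192,
                                                     cache_max_entry_count=0.4))
# The PyTorch engine is chosen the same way, passing PytorchEngineConfig instead.

image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
response = pipe(('describe this image', image))
print(response)
```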
Additionally, follow the instructions in [LLM Inference](#llm-inference) section to reduce the values of memory-related parameters ## Serving @@ -147,7 +147,7 @@ response = client.chat.completions.create( print(response) ``` -We encourage you to refer to the detailed guide for more comprehensive information about [serving with Docker](./llm/api_server.md), [function calls](llm/api_server_tools.md) and other topics +We encourage you to refer to the detailed guide for more comprehensive information about [serving with Docker](../llm/api_server.md), [function calls](../llm/api_server_tools.md) and other topics ### Serve a VLM model diff --git a/docs/en/get_started/index.rst b/docs/en/get_started/index.rst new file mode 100644 index 0000000000..4343ee9ab1 --- /dev/null +++ b/docs/en/get_started/index.rst @@ -0,0 +1,8 @@ +On Other Platforms +================================= + +.. toctree:: + :maxdepth: 1 + :caption: NPU(Huawei) + + ascend/get_started.md diff --git a/docs/en/installation.md b/docs/en/get_started/installation.md similarity index 100% rename from docs/en/installation.md rename to docs/en/get_started/installation.md diff --git a/docs/en/index.rst b/docs/en/index.rst index 5f5f3420dc..5d49e01c86 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -38,11 +38,12 @@ Documentation .. _get_started: .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Get Started - installation.md - get_started.md + get_started/installation.md + get_started/get_started.md + get_started/index.rst .. _supported_models: .. toctree:: @@ -101,6 +102,7 @@ Documentation advance/long_context.md advance/chat_template.md advance/debug_turbomind.md + advance/structed_output.md .. toctree:: :maxdepth: 1 diff --git a/docs/en/llm/qos.md b/docs/en/llm/qos.md deleted file mode 100644 index 221679e775..0000000000 --- a/docs/en/llm/qos.md +++ /dev/null @@ -1,219 +0,0 @@ -## LMDeploy-QoS Introduce and Usage - -### Background - -With the rise of Large Language Model (LLM) and Artificial General Intelligence (AGI), numerous inference frameworks have emerged. These frameworks deliver scalable and high-performance services by serving online workloads with language models. However, these workloads often come from multiple user groups, exhibiting rapid changes in workload patterns within short periods. Many inference frameworks struggle to meet the demands of such multi-tenancy traffic patterns and fail to effectively shape user behaviors. Therefore, we believe that systematically considering these issues in LLM inference framework is both valuable and necessary. - -### User Categorizations for Multi-tenancy Handling - -LMDeploy-QoS is part of LMDeploy, offering a range of multi-tenancy functionalities. It requires users to tag their inference requests with appropriate user identifications (user_id in configuration or codebase). The system operates based on a dictionary-like configuration that serves as a multi-tenancy policy. In this configuration, users are mapped to different classes, known as "user groups", each configured with a ratio value. Our multi-tenancy strategy reads this configuration and schedules user inference requests according to class priority and the difference between the predefined ratio and real-time allocation ratio. Extensive testing shows that LMDeploy-QoS significantly enhances LLM serving reliability and GPU resource utilization for real-world large language model inference workloads. 
- -We categorize LMDeploy users into four groups: - -- Platinum -- Gold -- Silver -- Bronze - -Based on our experiences in delivering LLM services, we can map the following four types of users to these user groups: - -- Platinum: VIP or administrative users. Examples include service inspectors or product demo presenters who require uninterrupted online services. Their workloads are typically at a low frequency and require limited resources. - -- Gold: Contracted business user groups requiring specific quantities of reliable services. For instance, Company A signs a contract with the LLM service provider to secure X requests/sec service capability with Z% availability for its employees at the cost of Y million dollars per year. - -- Silver: The vast majority of users fall under this category. Most trial or monthly subscribed users are included in this group. They need a relatively small quantity of services, but their user experiences significantly affect the LLM service reputation. - -- Bronze: Heavy users who pay minimal fees to LLM providers. - -The above user group categorization is intended for guidance rather than as a recommendation for all LMDeploy users, as it may not be suitable for all LLM service providers. Users can develop their own method of categorizing users based on their observations of daily workloads. - -Next, we will discuss how LMDeploy schedules requests based on these categorizations. - -### Multi-tenancy Strategies - -#### Strategy 1: prioritized scheduling between groups - -This strategy works as simple as its title suggests. - -User groups are introduced for this strategy, with users in each group to be specified. Recommended user groups are as follows: - -- Platinum -- Gold -- Silver -- Bronze - -The priority of each group decreases sequentially. Requests with higher priority are always given precedence for inference. Be noted that the scheduling is performed at the time of request reception, so lower-priority requests will not be withdrawn from the GPU if they are already under inference. - -The below diagram shows how the prioritization works. As you can see, the platinum request is reprioritized and moved to the queue head. - -![](https://github.com/InternLM/lmdeploy/assets/52888924/9d63f081-7168-4c74-8456-24f0a4b41649) - -#### Strategy 2: proportionally rated scheduling with a pre-defined ratio within user group - -This strategy works only within the user group. We introduce a within-group user quota configuration table. This table defines users' "ideal share ratio" with a sum value of 100% GPU resource. Each "user" appears in the list as a user_id, and a user can only belong to one user group. Requests from different users will be scheduled according to each user's "ideal share ratio". To be specific, users with their real-time usage ratio lower than their quota ratio will have priority over users whose real-time usage ratio is higher than their quota ratio. It is worth noting that the scheduling only considers users in the request queue, ignoring any absent users from the configuration table. - -The below diagram shows a typical example of how this strategy works. - -![](https://github.com/InternLM/lmdeploy/assets/52888924/3e1d7135-6b11-4998-89a1-b72af6c962c3) - -#### Strategy 3: a combination strategy of 1 and 2 - -We can call it a hybrid strategy. The way we hybrid these 2 strategies is fairly simple: we adopt strategy 1 in between user groups, and adopt strategy 2 within a user group. 
So users belonging to different groups with different priorities will only obey strategy 1 to determine their privilege in resource allocation. That is, when both strategies are applied, the first strategy will overpower the second. When it comes to a situation that no cross-group requests are waiting for serving, the within-group strategy 2 comes into play. - -Below is a diagram showing it. - -![](https://github.com/InternLM/lmdeploy/assets/52888924/e335f976-ff15-48db-b1ff-abf1c3327d6e) - -To be noted, there could be other ways of hybrid strategies 1 & 2, and this doc only introduces one method that works well in our scenario. Considering that prioritization and pro-rated sharing are obviously conflicting strategies, there is no easy way to mix them to work within a single dimension. - -### A Sample QoS Configuration - -The configuration will be specified by the `--qos-config-path` flag, and will be loaded by program upon startup. - -```json -{ - "enable_user_qos": true, - "user_groups": [ - "Platinum", - "Gold", - "Silver", - "Bronze" - ], - "user_group_map": { - "Platinum": [ - { - "id": "user_id0", - "quota_pct": 100 - }, - { - "id": "default", - "quota_pct": 0 - } - ], - "Gold": [ - { - "id": "user_id1", - "quota_pct": 50 - }, - { - "id": "user_id2", - "quota_pct": 50 - } - ], - "Silver": [ - { - "id": "user_id3", - "quota_pct": 5 - }, - { - "id": "default", - "quota_pct": 95 - } - ], - "Bronze": [ - { - "id": "user_id4", - "quota_pct": 30 - }, - { - "id": "user_id5", - "quota_pct": 30 - }, - { - "id": "user_id6", - "quota_pct": 40 - }, - { - "id": "default", - "quota_pct": 0 - } - ] - } -} -``` - -### How to perform inference job with Lmdeploy-QoS aware - -We provide the code link below to show how to call infer requests with multi-tenancy strategy awarded. What the qos related argument appears as in http body: - -/v1/chat/interactive_qos - -```bash -curl -X POST http://localhost/v1/chat/interactive_qos \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello,Hello", - "session_id": -1, - "interactive_mode": false, - "stream": false, - "stop": false, - "request_output_len": 512, - "top_p": 0.8, - "top_k": 40, - "temperature": 0.8, - "repetition_penalty": 1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/chat/completions_qos - -```bash -curl -X POST http://localhost/v1/chat/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "messages": "Hello,Hello", - "temperature": 0.7, - "top_p": 1, - "n": 1, - "max_tokens": 512, - "stop": false, - "stream": false, - "presence_penalty": 0, - "frequency_penalty": 0, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/completions_qos - -```bash -curl -X POST http://localhost/v1/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "prompt": "Hello,Hello", - "suffix": "string", - "temperature": 0.7, - "n": 1, - "max_tokens": 16, - "stop": "string", - "stream": false, - "top_p": 1, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -### File Configuration Modification - -The template of the configuration file is located at: `lmdeploy/server/qos_engine/qos_config.json.template`. Add the necessary users based on actual requirements, ensure correct priority assignment, and set appropriate quota values. 
- -### Passing Configuration Parameters - -Upon starting the api_server, pass the configuration file and its path using the `--qos-config-path` flag. An example is illustrated below: - -```bash -CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm-chat-7b --server-port 8000 --qos-config-path lmdeploy/serve/qos_engine/qos_config.json.template -``` - -### Contributor - -[Eric](https://github.com/rhinouser0), [sallyjunjun](https://github.com/sallyjunjun), [sfireworks](https://github.com/sfireworks), [Dofgal](https://github.com/Dofgal), [shadow](https://github.com/awslshadowstar) diff --git a/docs/en/multi_modal/cogvlm.md b/docs/en/multi_modal/cogvlm.md index d2114e574c..6673e3105b 100644 --- a/docs/en/multi_modal/cogvlm.md +++ b/docs/en/multi_modal/cogvlm.md @@ -17,7 +17,7 @@ pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https: pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https://download.pytorch.org/whl/cu121 ``` -Install LMDeploy by following the [installation guide](../installation.md) +Install LMDeploy by following the [installation guide](../get_started/installation.md) ### Prepare diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index 8f0f81387d..efa2b30a26 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -13,7 +13,7 @@ The next chapter demonstrates how to deploy an InternVL model using LMDeploy, wi ## Installation -Please install LMDeploy by following the [installation guide](../installation.md), and install other packages that InternVL2 needs +Please install LMDeploy by following the [installation guide](../get_started/installation.md), and install other packages that InternVL2 needs ```shell pip install timm diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index 9283fc1435..15774de7e7 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -11,7 +11,7 @@ The next chapter demonstrates how to deploy an MiniCPM-V model using LMDeploy, w ## Installation -Please install LMDeploy by following the [installation guide](../installation.md). +Please install LMDeploy by following the [installation guide](../get_started/installation.md). ## Offline inference diff --git a/docs/en/multi_modal/phi3.md b/docs/en/multi_modal/phi3.md index a801618b35..a7ad0237e2 100644 --- a/docs/en/multi_modal/phi3.md +++ b/docs/en/multi_modal/phi3.md @@ -13,7 +13,7 @@ The next chapter demonstrates how to deploy an Phi-3 model using LMDeploy, with ## Installation -Please install LMDeploy by following the [installation guide](../installation.md) and install the dependency [Flash-Attention](https://github.com/Dao-AILab/flash-attention) +Please install LMDeploy by following the [installation guide](../get_started/installation.md) and install the dependency [Flash-Attention](https://github.com/Dao-AILab/flash-attention) ```shell # It is recommended to find the whl package that matches the environment from the releases on https://github.com/Dao-AILab/flash-attention. 
diff --git a/docs/en/multi_modal/xcomposer2d5.md b/docs/en/multi_modal/xcomposer2d5.md index d6883c0023..2f56b65ea1 100644 --- a/docs/en/multi_modal/xcomposer2d5.md +++ b/docs/en/multi_modal/xcomposer2d5.md @@ -8,7 +8,7 @@ ### Installation -Please install LMDeploy by following the [installation guide](../installation.md), and install other packages that InternLM-XComposer-2.5 needs +Please install LMDeploy by following the [installation guide](../get_started/installation.md), and install other packages that InternLM-XComposer-2.5 needs ```shell pip install decord diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index a9e4e5e12e..3adaf7a750 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -9,7 +9,7 @@ The following NVIDIA GPUs are available for AWQ/GPTQ INT4 inference: - Ampere(sm80,sm86): 30 series, A10, A16, A30, A100 - Ada Lovelace(sm89): 40 series -Before proceeding with the quantization and inference, please ensure that lmdeploy is installed by following the [installation guide](../installation.md) +Before proceeding with the quantization and inference, please ensure that lmdeploy is installed by following the [installation guide](../get_started/installation.md) The remainder of this article is structured into the following sections: diff --git a/docs/zh_cn/advance/debug_turbomind.md b/docs/zh_cn/advance/debug_turbomind.md index cb95c6ef4d..3c3b75421d 100644 --- a/docs/zh_cn/advance/debug_turbomind.md +++ b/docs/zh_cn/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind 使用 C++ 实现,不像 Python 一样易于调试。该文档提供 ## 前置工作 -首先,根据构建[命令](../installation.md)完成源码编译和安装。 +首先,根据构建[命令](../get_started/installation.md)完成源码编译和安装。 ## 配置 Python 调试环境 diff --git a/docs/zh_cn/advance/structed_output.md b/docs/zh_cn/advance/structed_output.md new file mode 100644 index 0000000000..9f8e9c6cc4 --- /dev/null +++ b/docs/zh_cn/advance/structed_output.md @@ -0,0 +1,108 @@ +# 结构化输出 + +目前只有 Pytorch 后端具有该能力。所以无论是使用 pipline 还是使用 api_server,请指定使用 pytorch 后端。 + +## pipeline + +```python +from lmdeploy import pipeline +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig + +model = 'internlm/internlm2-chat-1_8b' +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': ['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +pipe = pipeline(model, backend_config=PytorchEngineConfig(), log_level='INFO') +gen_config = GenerationConfig( + response_format=dict(type='json_schema', json_schema=dict(name='test', schema=guide))) +response = pipe(['Make a self introduction please.'], gen_config=gen_config) +print(response) +``` + +## api_server + +首先,先启动 InternLM2 模型的 api_server 服务。 + +```shell +lmdeploy serve api_server internlm/internlm2-chat-1_8b --backend pytorch +``` + +客户端可以使用 OpenAI 的 python 包进行测试: + +```python +from openai import OpenAI +guide = { + 'type': 'object', + 'properties': { + 'name': { + 'type': 'string' + }, + 'skills': { + 'type': 'array', + 'items': { + 'type': 'string', + 'maxLength': 10 + }, + 'minItems': 3 + }, + 'work history': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'company': { + 'type': 'string' + }, + 'duration': { + 'type': 'string' + } + }, + 'required': 
['company'] + } + } + }, + 'required': ['name', 'skills', 'work history'] +} +response_format=dict(type='json_schema', json_schema=dict(name='test',schema=guide)) +messages = [{'role': 'user', 'content': 'Make a self-introduction please.'}] +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=0.8, + response_format=response_format, + top_p=0.8) +print(response) +``` + +输出结果是一个 json 格式的回答。 diff --git a/docs/zh_cn/benchmark/evaluate_with_opencompass.md b/docs/zh_cn/benchmark/evaluate_with_opencompass.md index 94ba5326bb..f4480718a8 100644 --- a/docs/zh_cn/benchmark/evaluate_with_opencompass.md +++ b/docs/zh_cn/benchmark/evaluate_with_opencompass.md @@ -8,7 +8,7 @@ LMDeploy设计了TurboMind推理引擎用来加速大模型推理,其推理精 ### 安装 lmdeploy -请参考[安装指南](../installation.md)安装 lmdeploy +请参考[安装指南](../get_started/installation.md)安装 lmdeploy ### 安装 OpenCompass diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md new file mode 100644 index 0000000000..01626e49d6 --- /dev/null +++ b/docs/zh_cn/get_started/ascend/get_started.md @@ -0,0 +1,119 @@ +# 华为昇腾(Atlas 800T A2) + +我们采用了LMDeploy中的PytorchEngine后端支持了华为昇腾设备, +所以在华为昇腾上使用lmdeploy的方法与在英伟达GPU上使用PytorchEngine后端的使用方法几乎相同。 +在阅读本教程之前,请先阅读原版的[快速开始](../get_started.md)。 + +## 安装 + +### 环境准备 + +#### Drivers和Firmware + +Host需要安装华为驱动程序和固件版本23.0.3,请参考 +[CANN 驱动程序和固件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha003/softwareinst/instg/instg_0019.html) +和[下载资源](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.RC3.alpha001&driver=1.0.0.2.alpha)。 + +#### CANN + +`docker/Dockerfile_aarch64_ascend`没有提供CANN 安装包,用户需要自己从[昇腾资源下载中心](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.alpha001)下载CANN(8.0.RC3.alpha001)软件包。 +并将Ascend-cann-kernels-910b\*.run 和 Ascend-cann-toolkit\*-aarch64.run 放在执行`docker build`命令的目录下。 + +#### Docker + +构建aarch64_ascend镜像需要Docker>=18.03 + +#### 构建镜像的命令 + +请在lmdeploy源代码根目录下执行以下镜像构建命令,CANN相关的安装包也放在此目录下。 + +```bash +DOCKER_BUILDKIT=1 docker build -t lmdeploy-aarch64-ascend:v0.1 \ +    -f docker/Dockerfile_aarch64_ascend . 
+``` + +这个镜像将使用`pip install --no-build-isolation -e .`命令将lmdeploy安装到/workspace/lmdeploy目录。 + +#### 镜像的使用 + +关于镜像的使用方式,请参考这篇[文档](https://www.hiascend.com/document/detail/zh/mindx-dl/60rc1/clusterscheduling/dockerruntimeug/dlruntime_ug_013.html)。 +并且在使用镜像前安装Ascend Docker Runtime。 +以下是在安装了 Ascend Docker Runtime 的情况下,启动用于华为昇腾设备的容器的示例: + +```bash +docker run -e ASCEND_VISIBLE_DEVICES=0 --net host -td --entrypoint bash --name lmdeploy_ascend_demo \ +    lmdeploy-aarch64-ascend:v0.1  # docker_image_sha_or_name +``` + +#### 使用Pip安装 + +如果您已经安装了lmdeploy并且所有华为环境都已准备好,您可以运行以下命令使lmdeploy能够在华为昇腾设备上运行。(如果使用Docker镜像则不需要) + +```bash +pip install dlinfer-ascend +``` + +## 离线批处理 + +### LLM 推理 + +将`device_type="ascend"`加入`PytorchEngineConfig`的参数中。 + +```python +from lmdeploy import pipeline +from lmdeploy import PytorchEngineConfig +if __name__ == "__main__": +    pipe = pipeline("internlm/internlm2_5-7b-chat", +     backend_config = PytorchEngineConfig(tp=1, device_type="ascend")) +    question = ["Shanghai is", "Please introduce China", "How are you?"] +    response = pipe(question) +    print(response) +``` + +### VLM 推理 + +将`device_type="ascend"`加入`PytorchEngineConfig`的参数中。 + +```python +from lmdeploy import pipeline, PytorchEngineConfig +from lmdeploy.vl import load_image +if __name__ == "__main__": +    pipe = pipeline('OpenGVLab/InternVL2-2B', + backend_config=PytorchEngineConfig(tp=1, device_type='ascend')) +    image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +    response = pipe(('describe this image', image)) +    print(response) +``` + +## 在线服务 + +### LLM 模型服务 + +将`--device ascend`加入到服务启动命令中。 + +```bash +lmdeploy serve api_server --backend pytorch --device ascend internlm/internlm2_5-7b-chat +``` + +### VLM 模型服务 + +将`--device ascend`加入到服务启动命令中。 + +```bash +lmdeploy serve api_server --backend pytorch --device ascend OpenGVLab/InternVL2-2B +``` + +## 使用命令行与LLM模型对话 + +将`--device ascend`加入到服务启动命令中。 + +```bash +lmdeploy chat internlm/internlm2_5-7b-chat --backend pytorch --device ascend +``` + +也可以运行以下命令,在启动容器后开启 lmdeploy 聊天 + +```bash +docker exec -it lmdeploy_ascend_demo \ +    bash -i -c "lmdeploy chat --backend pytorch --device ascend internlm/internlm2_5-7b-chat" +``` diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started/get_started.md similarity index 95% rename from docs/zh_cn/get_started.md rename to docs/zh_cn/get_started/get_started.md index 5649397a8f..51d5f0ff81 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started/get_started.md @@ -21,7 +21,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -在构造 `pipeline` 时,如果没有指定使用 TurboMind 引擎或 PyTorch 引擎进行推理,LMDeploy 将根据[它们各自的能力](supported_models/supported_models.md)自动分配一个,默认优先使用 TurboMind 引擎。 +在构造 `pipeline` 时,如果没有指定使用 TurboMind 引擎或 PyTorch 引擎进行推理,LMDeploy 将根据[它们各自的能力](../supported_models/supported_models.md)自动分配一个,默认优先使用 TurboMind 引擎。 然而,你可以选择手动选择一个引擎。例如, @@ -73,7 +73,7 @@ response = pipe(prompts, 在 `GenerationConfig` 中,`top_k=1` 或 `temperature=0.0` 表示贪心搜索。 -有关 pipeline 的更多信息,请参考[这里](llm/pipeline.md) +有关 pipeline 的更多信息,请参考[这里](../llm/pipeline.md) ### VLM 推理 @@ -144,7 +144,7 @@ response = client.chat.completions.create( print(response) ``` -我们鼓励你参考详细指南,了解关于[使用 Docker 部署服务](./llm/api_server.md)、[工具调用](llm/api_server_tools.md)和其他更多功能的信息。 +我们鼓励你参考详细指南,了解关于[使用 Docker 部署服务](../llm/api_server.md)、[工具调用](../llm/api_server_tools.md)和其他更多功能的信息。 ### VLM 模型服务 diff --git a/docs/zh_cn/get_started/index.rst b/docs/zh_cn/get_started/index.rst new file mode
100644 index 0000000000..35affc13ce --- /dev/null +++ b/docs/zh_cn/get_started/index.rst @@ -0,0 +1,8 @@ +其他软硬件平台 +================================= + +.. toctree:: + :maxdepth: 1 + :caption: NPU(Huawei) + + ascend/get_started.md diff --git a/docs/zh_cn/installation.md b/docs/zh_cn/get_started/installation.md similarity index 100% rename from docs/zh_cn/installation.md rename to docs/zh_cn/get_started/installation.md diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 262f970ce0..018a00487f 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -41,8 +41,9 @@ LMDeploy 工具箱提供以下核心功能: :maxdepth: 2 :caption: 快速上手 - installation.md - get_started.md + get_started/installation.md + get_started/get_started.md + get_started/index.rst .. _支持的模型: .. toctree:: @@ -102,6 +103,7 @@ LMDeploy 工具箱提供以下核心功能: advance/long_context.md advance/chat_template.md advance/debug_turbomind.md + advance/structed_output.md .. toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/llm/qos.md b/docs/zh_cn/llm/qos.md deleted file mode 100644 index 10b717fd74..0000000000 --- a/docs/zh_cn/llm/qos.md +++ /dev/null @@ -1,225 +0,0 @@ -## LMDeploy-QoS 介绍与用法 - -### 背景 - -在过去一段时间,推理框架伴随着LLM和AGI出现。许多推理框架为语言模型提供可扩展和高性能的在线工作负载服务。它们的工作负载通常涉及多个用户群体,而且工作负载在短时间内快速变化。许多推理框架在满足这些多租户流量模式的要求方面存在困难,而且未能很好的规范约束用户的行为,所以我们认为在LLM推理框架考虑多用户负载均衡是很有必要的。 - -### 多租户处理的用户分类 - -LMDeploy-QoS与LMDeploy 提供一系列多租户功能。它要求用户使用适当的用户标识(配置文件或代码库中的user_id)标记其推理请求。它是基于字典的配置作为多租户策略。在这个配置中,用户被映射到不同“用户组”中,并配备一个使用配额。我们的多租户策略可以读取配置,并根据其用户组的优先级和预定义配额与实时分配比率之间的差异安排用户推理请求的调度。经过完备的测试,我们的LMDeploy-QoS模块极大地提高了LLM的服务可靠性并提升了大型语言模型推理工作的GPU资源利用率。 - -LMDeploy将用户分为4组: - -- 白金(Platinum) -- 金(Gold) -- 银(Silver) -- 青铜(Bronze) - -根据我们在提供LLM服务方面的使用经验,我们可以将以下4种类型的用户映射到这些用户组中: - -- Platinum : VIP用户或管理员用户。包括需要不间断使用的的服务开发人员或演示人员。他们的工作负载频率低,对推理工作的资源需求也不高。 - -- Gold : 签署定期服务的高级用户,他们需要可衡量的可靠服务。例如,某个公司A与LLM服务提供商签订了合同,购买了每秒X个请求的服务能力,可用性为Z%,供A公司员工使用,年付Y百万美元。 - -- Silver : 绝大多数用户。大多数试用或每月订阅的用户被归类为此类别。他们需要相对较少的服务,但他们的用户体验对于LLM服务的声誉也很重要。 - -- Bronze : 支付很少费用给LLM提供商的重度用户。 - -以上引入用户组分类的目的是为了提供指导,而不是为所有LMDeploy用户提供建议,因为这并不一定适用于所有LLM业务提供商。管理员可以对用户的日常负载进行统计,自行决定如何对用户进行分类。 - -接下来让我们讨论一下LMDeploy如何根据这些分类进行分配请求。 - -### 多租户策略 - -#### 策略 1: 用户组之间的优先级调度 - -我们引入“用户组”概念。由模块使用者来定义哪些用户到用户组的映射(可以理解为 uid 到用户组的映射)。推荐用户组为4组如下: - -- Platinum -- Gold -- Silver -- Bronze - -四个用户组之间的优先级顺序是严格的 Platinum > Gold > Silver > Bronze 。当系统繁忙的时候,我们会优先执行排名靠前的请求。 - -下面的图表显示了优先级处理的工作原理。您可以看到 Platinum 请求已被重新设置优先级并移至队列头部。 - -![](https://github.com/InternLM/lmdeploy/assets/52888924/9d63f081-7168-4c74-8456-24f0a4b41649) - -#### 策略 2: 用户组内均摊与软隔离 - -这个策略仅适用于用户组内部。我们引入了一个用户组内的用户配额配置表。该表定义了用户在 100% GPU 资源中的 “理想份额比例”。每个 “用户” 在列表中以 user_id 的形式出现,并且一个用户只能属于一个用户组。低于配额表上额定值的用户会比高于额定值的用户拥有更高的优先级获得被释放资源而进行更多的推理,直到双方使用量趋近于原始配额比例。此处调度只考虑请求队列中的用户,忽略没有出现在请求队列中的已配置用户。 - -以下图表展示了这种策略的典型示例。 - -![](https://github.com/InternLM/lmdeploy/assets/52888924/3e1d7135-6b11-4998-89a1-b72af6c962c3) - -#### 策略3:混合机制 - -是指在一个系统中优先级+均摊/隔离同时开启。执行顺序是先用户组间优先级,再在组内做均摊/隔离实现。这里略去时序图描写。需要注意的是,用户组间的优先级可以压倒性覆盖组内的决策。例如,当低优先级内部的两个用户互相之间有请求顺序调度时,高优先级的请求一旦抵达,将会覆盖所有低优先级的分配逻辑而有限执行高优任务。 - -![](https://github.com/InternLM/lmdeploy/assets/52888924/e335f976-ff15-48db-b1ff-abf1c3327d6e) - -需要注意的是,混合机制可能有其他方法,本文档只介绍了一种在我们场景下有效的方法。其他混合方法需要考虑到优先级和按比例共享明显是相互冲突的策略,因此没有简单的方法将它们混合在单一维度内工作。 - -### QoS 配置项模板 - -配置文件通过启动参数`--qos-config-path`指定,并由程序在启动时加载。 - -配置会和lmdeploy启动脚本等文件放置在一起。配置内容包含: - -1. QoS的启用开关,设置为True时后续的QoS和用户相关配置才会生效,设置为False后续配置不会生效; - -2. user_groups 是一个列表,包含了多种不同的组间优先级; - -3. 
user_group_map 的映射配置,包含了用户组优先级,组内用户id以及每个用户组内用户的配额分配。 - -配置项模板如下: - -```json -{ - "enable_user_qos": true, - "user_groups": [ - "Platinum", - "Gold", - "Silver", - "Bronze" - ], - "user_group_map": { - "Platinum": [ - { - "id": "user_id0", - "quota_pct": 100 - }, - { - "id": "default", - "quota_pct": 0 - } - ], - "Gold": [ - { - "id": "user_id1", - "quota_pct": 50 - }, - { - "id": "user_id2", - "quota_pct": 50 - } - ], - "Silver": [ - { - "id": "user_id3", - "quota_pct": 5 - }, - { - "id": "default", - "quota_pct": 95 - } - ], - "Bronze": [ - { - "id": "user_id4", - "quota_pct": 30 - }, - { - "id": "user_id5", - "quota_pct": 30 - }, - { - "id": "user_id6", - "quota_pct": 40 - }, - { - "id": "default", - "quota_pct": 0 - } - ] - } -} -``` - -### 如何使用 LMDeploy-QoS 感知进行推理 - -我们提供以下代码链接,展示如何调用具有多租户策略感知的推理请求,在 HTTP Body 中,与 QoS 相关的参数如下: - -/v1/chat/interactive_qos - -```bash -curl -X POST http://localhost/v1/chat/interactive_qos \ - -H "Content-Type: application/json" \ - -d '{ - "prompt": "Hello,Hello", - "session_id": -1, - "interactive_mode": false, - "stream": false, - "stop": false, - "request_output_len": 512, - "top_p": 0.8, - "top_k": 40, - "temperature": 0.8, - "repetition_penalty": 1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/chat/completions_qos - -```bash -curl -X POST http://localhost/v1/chat/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "messages": "Hello,Hello", - "temperature": 0.7, - "top_p": 1, - "n": 1, - "max_tokens": 512, - "stop": false, - "stream": false, - "presence_penalty": 0, - "frequency_penalty": 0, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -/v1/completions_qos - -```bash -curl -X POST http://localhost/v1/completions_qos \ - -H "Content-Type: application/json" \ - -d '{ - "model": "internlm-chat-7b", - "prompt": "Hello,Hello", - "suffix": "string", - "temperature": 0.7, - "n": 1, - "max_tokens": 16, - "stop": "string", - "stream": false, - "top_p": 1, - "repetition_penalty": 1, - "session_id": -1, - "ignore_eos": false, - "user_id": "user_id0" -}' -``` - -### 配置文件修改 - -配置文件模板路径为:`lmdeploy/server/qos_engine/qos_config.json.template`,可以根据实际需求添加需要配置的用户,设置正确的优先级以及quota值。 - -### 配置参数传入 - -启动api_server时,通过`--qos-config-path`,将配置文件及路径传入,示例如下: - -```bash -CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm-chat-7b --server-port 8000 --qos-config-path lmdeploy/serve/qos_engine/qos_config.json.template -``` - -### 贡献者 - -[Eric](https://github.com/rhinouser0), [sallyjunjun](https://github.com/sallyjunjun), [sfireworks](https://github.com/sfireworks), [Dofgal](https://github.com/Dofgal), [shadow](https://github.com/awslshadowstar) diff --git a/docs/zh_cn/multi_modal/cogvlm.md b/docs/zh_cn/multi_modal/cogvlm.md index 131ad4f4aa..9810e671d4 100644 --- a/docs/zh_cn/multi_modal/cogvlm.md +++ b/docs/zh_cn/multi_modal/cogvlm.md @@ -17,7 +17,7 @@ pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https: pip install torch==2.2.2 torchvision==0.17.2 xformers==0.0.26 --index-url https://download.pytorch.org/whl/cu121 ``` -请参考[安装文档](../installation.md)安装 LMDeploy +请参考[安装文档](../get_started/installation.md)安装 LMDeploy ### 准备 diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md index c51870d6b2..1abcbc7d06 100644 --- a/docs/zh_cn/multi_modal/internvl.md +++ b/docs/zh_cn/multi_modal/internvl.md @@ -13,7 +13,7 @@ LMDeploy 支持 InternVL 系列模型,具体如下: ## 安装 -请参考[安装文档](../installation.md)安装 
LMDeploy,并安装上游 InternVL 模型库需的依赖。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy,并安装上游 InternVL 模型库需的依赖。 ```shell pip install timm diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md index 8b41bd511e..b605bc1fcc 100644 --- a/docs/zh_cn/multi_modal/minicpmv.md +++ b/docs/zh_cn/multi_modal/minicpmv.md @@ -11,7 +11,7 @@ LMDeploy 支持 MiniCPM-V 系列模型,具体如下: ## 安装 -请参考[安装文档](../installation.md)安装 LMDeploy。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy。 ## 离线推理 diff --git a/docs/zh_cn/multi_modal/phi3.md b/docs/zh_cn/multi_modal/phi3.md index 2ed120344b..b5545d30b6 100644 --- a/docs/zh_cn/multi_modal/phi3.md +++ b/docs/zh_cn/multi_modal/phi3.md @@ -13,7 +13,7 @@ ## 安装 -请参考[安装文档](../installation.md)安装 LMDeploy,并安装该模型的依赖。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy,并安装该模型的依赖。 ```shell # 建议从https://github.com/Dao-AILab/flash-attention/releases寻找和环境匹配的whl包 diff --git a/docs/zh_cn/multi_modal/xcomposer2d5.md b/docs/zh_cn/multi_modal/xcomposer2d5.md index 31973a4941..033d25c8ac 100644 --- a/docs/zh_cn/multi_modal/xcomposer2d5.md +++ b/docs/zh_cn/multi_modal/xcomposer2d5.md @@ -8,7 +8,7 @@ ### 安装 -请参考[安装文档](../installation.md)安装 LMDeploy,并安装上游模型库 InternLM-XComposer-2.5 所需的依赖。 +请参考[安装文档](../get_started/installation.md)安装 LMDeploy,并安装上游模型库 InternLM-XComposer-2.5 所需的依赖。 ```shell pip install decord diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index 83237b992d..b61b894781 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -9,7 +9,7 @@ LMDeploy TurboMind 引擎支持由 [AWQ](https://arxiv.org/abs/2306.00978) 和 [ - Ampere(sm80,sm86): 30 系列,A10, A16, A30, A100 - Ada Lovelace(sm89): 40 系列 -在进行量化和推理之前,请确保按照[安装指南](../installation.md)安装了 lmdeploy。 +在进行量化和推理之前,请确保按照[安装指南](../get_started/installation.md)安装了 lmdeploy。 本文的其余部分由以下章节组成: diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini deleted file mode 100644 index ed07d8038d..0000000000 --- a/examples/cpp/llama/llama_config.ini +++ /dev/null @@ -1,82 +0,0 @@ -[ft_instance_hyperparameter] -data_type=fp16 -enable_custom_all_reduce=0 -pipeline_para_size=1 -tensor_para_size=1 -; update model_dir path according to the actual situation -model_dir=/workspace/models/triton_models/weights/ - - -[request] -request_batch_size=8 -max_input_len=1 -request_output_len=2048 -beam_width=1 ; beam width for beam search -top_k=1 ; k value for top k sampling -top_p=0.0 ; p value for top p sampling -temperature=1.0 ; Use for sampling -repetition_penalty=1.00 ; Use for sampling -presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. 
-len_penalty=0.0 -beam_search_diversity_rate=0.0 -; PJLM start/end ids -start_id=0 -end_id=1 - - -; --------------------- legacy params ------------------------- - -; LLaMA start/end ids -; start_id=1 -; end_id=2 - -[4999_llama] -head_num=80 -size_per_head=128 -vocab_size=65632 -num_layer=82 -rotary_embedding=128 -norm_eps=1e-5 -start_id=0 -end_id=1 -inter_size=27392 - -[llama_7B] -head_num=32 -size_per_head=128 -vocab_size=32000 -num_layer=32 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=11008 - -[llama_13B] -head_num=40 -size_per_head=128 -vocab_size=32000 -num_layer=40 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=13824 - -[llama_30B] -head_num=52 -size_per_head=128 -vocab_size=32000 -num_layer=60 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=17920 - -[llama_65B] -head_num=64 -size_per_head=128 -vocab_size=32000 -num_layer=80 -rotary_embedding=128 -start_id=1 -end_id=2 -inter_size=22016 diff --git a/examples/cpp/llama/llama_config.yaml b/examples/cpp/llama/llama_config.yaml new file mode 100644 index 0000000000..463614de06 --- /dev/null +++ b/examples/cpp/llama/llama_config.yaml @@ -0,0 +1,24 @@ +ft_instance_hyperparameter: + data_type: fp16 + enable_custom_all_reduce: 0 + pipeline_para_size: 1 + tensor_para_size: 1 + # update model_dir path according to the actual situation + model_dir: /workspace/models/triton_models/weights/ + + +request: + request_batch_size: 8 + max_input_len: 1 + request_output_len: 2048 + beam_width: 1 # beam width for beam search + top_k: 1 # k value for top k sampling + top_p: 0.0 # p value for top p sampling + temperature: 1.0 # Use for sampling + repetition_penalty: 1.00 # Use for sampling + presence_penalty: 0.0 # Only one of repetition_penalty and presence_penalty are allowed. + len_penalty: 0.0 + beam_search_diversity_rate: 0.0 + # PJLM start/end ids + start_id: 0 + end_id: 1 diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc index 11036cf1b7..b0e513410e 100644 --- a/examples/cpp/llama/llama_triton_example.cc +++ b/examples/cpp/llama/llama_triton_example.cc @@ -18,7 +18,7 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/examples/cpp/multi_gpu_gpt/multi_gpu_gpt_triton_example.cc -#include "3rdparty/INIReader.h" +#include #include #include #include @@ -254,20 +254,24 @@ int read_start_ids(size_t batch_size, std::string file_name); std::vector>> -prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record, const std::string& csv_name) +prepareRequest(std::string config_file, const int node_id, const int gpu_count, std::vector* pointer_record, const std::string& csv_name) { - INIReader reader = INIReader(ini_name); - if (reader.ParseError() < 0) { - std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + YAML::Node reader; + try { + reader = YAML::Load(config_file); + } + catch (const YAML::Exception& e) { + std::cerr << "Error reading YAML config: " << e.what() << std::endl; ft::FT_CHECK(false); } + auto request = reader["request"]; - const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + const size_t request_batch_size = request["request_batch_size"].as(); std::cerr << "request_batch_size=" << request_batch_size << "\n"; - const int start_id = reader.GetInteger("request", "start_id"); - const int end_id = reader.GetInteger("request", "end_id"); - const int max_input_len = reader.GetInteger("request", "max_input_len"); + const int start_id = request["start_id"].as(); + 
const int end_id = request["end_id"].as(); + const int max_input_len = request["max_input_len"].as(); std::vector v_start_ids; std::vector v_start_lengths; @@ -289,16 +293,16 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std std::vector v_bad_words; RequestParam param; - param.beam_width = reader.GetInteger("request", "beam_width"); - param.request_output_len = reader.GetInteger("request", "request_output_len"); - param.beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); - param.runtime_top_k = reader.GetInteger("request", "top_k"); - param.runtime_top_p = reader.GetFloat("request", "top_p"); - param.temperature = reader.GetFloat("request", "temperature"); - param.len_penalty = reader.GetFloat("request", "len_penalty"); - param.repetition_penalty = reader.GetFloat("request", "repetition_penalty", 1.0f); - param.presence_penalty = reader.GetFloat("request", "presence_penalty", 0.0f); - param.min_length = reader.GetInteger("request", "min_length", 0); + param.beam_width = request["beam_width"].as(); + param.request_output_len = request["request_output_len"].as(); + param.beam_search_diversity_rate = request["beam_search_diversity_rate"].as(); + param.runtime_top_k = request["top_k"].as(); + param.runtime_top_p = request["top_p"].as(); + param.temperature = request["temperature"].as(); + param.len_penalty = request["len_penalty"].as(); + param.repetition_penalty = request["repetition_penalty"].as(1.0f); + param.presence_penalty = request["presence_penalty"].as(0.0f); + param.min_length = request["min_length"].as(0); param.random_seed = (unsigned long long int)0; param.start_id = start_id; param.end_id = end_id; @@ -361,11 +365,11 @@ int main(int argc, char* argv[]) // Note: Only supports that all nodes have same gpu count const int gpu_count = ft::getDeviceCount(); const int world_size = node_num * gpu_count; - printf("Recommend to specify the first parameter on the command line as the path to llama_config.ini\n"); - std::string ini_name = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.ini"; + printf("Recommend to specify the first parameter on the command line as the path to llama_config.yaml\n"); + std::string config_file = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.yaml"; // step 1: Create model - std::shared_ptr model = AbstractTransformerModel::createLlamaModel(ini_name); + std::shared_ptr model = AbstractTransformerModel::createLlamaModel(config_file); int tensor_para_size = model->getTensorParaSize(); int pipeline_para_size = model->getPipelineParaSize(); printf( @@ -406,7 +410,7 @@ int main(int argc, char* argv[]) std::vector pointer_record; // Used to prevent the pointers are // release after leaving functions std::vector>> request_list = - prepareRequest(ini_name, node_id, gpu_count, &pointer_record, csv_name); + prepareRequest(config_file, node_id, gpu_count, &pointer_record, csv_name); printf("[INFO] request is created \n"); // step 5: Forward diff --git a/lmdeploy/__init__.py b/lmdeploy/__init__.py index a9a6ee095a..df64717919 100644 --- a/lmdeploy/__init__.py +++ b/lmdeploy/__init__.py @@ -1,16 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .api import client, pipeline, serve -from .messages import (EngineGenerationConfig, GenerationConfig, - PytorchEngineConfig, TurbomindEngineConfig, - VisionConfig) +from .messages import (GenerationConfig, PytorchEngineConfig, + TurbomindEngineConfig, VisionConfig) from .model import ChatTemplateConfig from .tokenizer import Tokenizer from .version import __version__, version_info __all__ = [ 'pipeline', 'serve', 'client', 'Tokenizer', 'GenerationConfig', - 'EngineGenerationConfig', '__version__', 'version_info', - 'ChatTemplateConfig', 'PytorchEngineConfig', 'TurbomindEngineConfig', - 'VisionConfig' + '__version__', 'version_info', 'ChatTemplateConfig', 'PytorchEngineConfig', + 'TurbomindEngineConfig', 'VisionConfig' ] diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py index 64b714765a..50757d50d4 100644 --- a/lmdeploy/archs.py +++ b/lmdeploy/archs.py @@ -157,16 +157,16 @@ def get_model_arch(model_path: str): """ if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')): # the turbomind model - import configparser + import yaml config_file = os.path.join(model_path, 'triton_models', 'weights', - 'config.ini') - config = configparser.ConfigParser() - config.read(config_file) - model_arch = config['llama']['model_arch'] - tm_config = TurbomindEngineConfig() - for key in config['llama']: - setattr(tm_config, key, config['llama'][key]) - return model_arch, tm_config + 'config.yaml') + with open(config_file, 'r') as f: + config = yaml.safe_load(f) + + from .turbomind.deploy.config import TurbomindModelConfig + tm_config = TurbomindModelConfig.from_dict(config) + + return tm_config.model_config.model_arch, tm_config else: # transformers model try: diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index 9aa6000505..1239f9d365 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -48,6 +48,34 @@ def add_parser_auto_awq(): default=128, help='Group size for weight quantization statistics') + @staticmethod + def add_parser_auto_gptq(): + """Add parser for auto_gptq command.""" + parser = SubCliLite.subparsers.add_parser( + 'auto_gptq', + formatter_class=DefaultsAndTypesHelpFormatter, + description=SubCliLite.auto_gptq.__doc__, + help=SubCliLite.auto_gptq.__doc__) + parser.set_defaults(run=SubCliLite.auto_gptq) + parser.add_argument('model', + type=str, + help='The path of model in hf format') + ArgumentHelper.revision(parser) + ArgumentHelper.work_dir(parser) + ArgumentHelper.calib_dataset(parser) + ArgumentHelper.calib_samples(parser) + ArgumentHelper.calib_seqlen(parser) + ArgumentHelper.calib_batchsize(parser) + parser.add_argument('--w-bits', + type=int, + default=4, + help='Bit number for weight quantization') + parser.add_argument( + '--w-group-size', + type=int, + default=128, + help='Group size for weight quantization statistics') + @staticmethod def add_parser_calibrate(): """Add parser for calibrate command.""" @@ -97,6 +125,13 @@ def auto_awq(args): kwargs = convert_args(args) auto_awq(**kwargs) + @staticmethod + def auto_gptq(args): + """Perform weight quantization using GPTQ algorithm.""" + from lmdeploy.lite.apis.gptq import auto_gptq + kwargs = convert_args(args) + auto_gptq(**kwargs) + @staticmethod def calibrate(args): """Perform calibration on a given dataset.""" @@ -115,5 +150,6 @@ def smooth_quant(args): def add_parsers(): """Add all parsers.""" SubCliLite.add_parser_auto_awq() + SubCliLite.add_parser_auto_gptq() SubCliLite.add_parser_calibrate() SubCliLite.add_parser_smooth_quant() diff --git a/lmdeploy/lite/apis/gptq.py 
b/lmdeploy/lite/apis/gptq.py new file mode 100644 index 0000000000..12b88a52cd --- /dev/null +++ b/lmdeploy/lite/apis/gptq.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch +from transformers import AutoTokenizer + +from lmdeploy.lite.utils.calib_dataloader import get_calib_loaders + + +def auto_gptq(model: str, + work_dir: str = './work_dir', + w_bits: int = 4, + w_group_size: int = 128, + calib_dataset: str = 'ptb', + calib_samples: int = 128, + calib_seqlen: int = 2048, + batch_size: int = 1, + revision: str = None): + """Perform weight quantization using GPTQ algorithm. + + Args: + model (str): The path of model in hf format. + work_dir (str): The working directory to save results. + calib_dataset (str): The calibration dataset name. + calib_samples (int): The number of samples for calibration. + batch_size (int): The batch size for running the calib samples. + Low GPU mem requires small batch_size. Large batch_size + reduces the calibration time while costs more VRAM. + calib_seqlen (int): The sequence length for calibration. + w_bits (int): Bit number for weight quantization. + w_group_size (int): Group size for weight quantization statistics. + revision (str): The specific model version to use. It can be a + branch name, a tag name, or a commit id. If unspecified, + will use the default version. + """ + try: + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig + except Exception: + raise ImportError('To use auto_gptq, please install auto-gptq by ' + 'pip install auto-gptq') + logging.basicConfig( + format='%(asctime)s %(levelname)s [%(name)s] %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S', + ) + # support internlm2 + from auto_gptq.modeling import GPTQ_CAUSAL_LM_MODEL_MAP + from auto_gptq.modeling._const import SUPPORTED_MODELS + + from ..modeling.internlm2_gptq import InternLM2GPTQForCausalLM + SUPPORTED_MODELS.append('internlm2') + GPTQ_CAUSAL_LM_MODEL_MAP.update(dict(internlm2=InternLM2GPTQForCausalLM)) + + pretrained_model_dir = model + quantized_model_dir = work_dir + + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, + trust_remote_code=True) + print('Loading calibration dataset ...') + calib_loader, _ = get_calib_loaders(calib_dataset, + tokenizer, + nsamples=calib_samples, + seqlen=calib_seqlen) + all_data = [ + data if isinstance(data, torch.Tensor) else data[0] + for data in calib_loader + ] + attention_mask = [1] * calib_seqlen + examples = [ + dict(input_ids=data.flatten().tolist(), attention_mask=attention_mask) + for data in all_data + ] + + quantize_config = BaseQuantizeConfig( + bits=w_bits, # quantize model to 4-bit + group_size=w_group_size, # it is recommended to set the value to 128 + desc_act=False, # lmdeploy only supports False + sym=True, # lmdeploy only supports True + ) + + # load un-quantized model, by default, + # the model will always be loaded into CPU memory + model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, + quantize_config, + revision=revision, + trust_remote_code=True) + + # quantize model, the examples should be list of dict whose keys + # can only be "input_ids" and "attention_mask" + model.quantize(examples, batch_size=batch_size) + + # save quantized model + model.save_quantized(quantized_model_dir) + + tokenizer.save_pretrained(quantized_model_dir) + + +if __name__ 
== '__main__': + import fire + + fire.Fire(auto_gptq) diff --git a/lmdeploy/lite/modeling/__init__.py b/lmdeploy/lite/modeling/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/lmdeploy/lite/modeling/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/lite/modeling/internlm2_gptq.py b/lmdeploy/lite/modeling/internlm2_gptq.py new file mode 100644 index 0000000000..a8b493c46a --- /dev/null +++ b/lmdeploy/lite/modeling/internlm2_gptq.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from auto_gptq.modeling import BaseGPTQForCausalLM + + +class InternLM2GPTQForCausalLM(BaseGPTQForCausalLM): + layer_type = 'InternLM2DecoderLayer' + layers_block_name = 'model.layers' + outside_layer_modules = ['model.tok_embeddings', 'model.norm'] + inside_layer_modules = [ + ['attention.wqkv'], + ['attention.wo'], + ['feed_forward.w3', 'feed_forward.w1'], + ['feed_forward.w2'], + ] diff --git a/lmdeploy/lite/utils/calib_dataloader.py b/lmdeploy/lite/utils/calib_dataloader.py index 27cde59f56..b5cf7e1f6a 100644 --- a/lmdeploy/lite/utils/calib_dataloader.py +++ b/lmdeploy/lite/utils/calib_dataloader.py @@ -22,8 +22,14 @@ def get_wikitext2(tokenizer, nsamples, seed, seqlen): test_enc: Full tokenized Wikitext-2 test set. """ from datasets import load_dataset - traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') - testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + traindata = load_dataset('wikitext', + 'wikitext-2-raw-v1', + split='train', + trust_remote_code=True) + testdata = load_dataset('wikitext', + 'wikitext-2-raw-v1', + split='test', + trust_remote_code=True) trainenc = tokenizer('\n\n'.join(traindata['text']), return_tensors='pt') testenc = tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') @@ -55,10 +61,14 @@ def get_ptb(tokenizer, nsamples, seed, seqlen): test_enc: Full tokenized PTB validation set. """ from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') + traindata = load_dataset('ptb_text_only', + 'penn_treebank', + split='train', + trust_remote_code=True) valdata = load_dataset('ptb_text_only', 'penn_treebank', - split='validation') + split='validation', + trust_remote_code=True) trainenc = tokenizer('\n\n'.join(traindata['sentence']), return_tensors='pt') @@ -96,13 +106,15 @@ def get_c4(tokenizer, nsamples, seed, seqlen): 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', - use_auth_token=False) + use_auth_token=False, + trust_remote_code=True) valdata = load_dataset( 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation', - use_auth_token=False) + use_auth_token=False, + trust_remote_code=True) import random random.seed(seed) @@ -158,8 +170,14 @@ def get_ptb_new(tokenizer, nsamples, seed, seqlen): test_enc: Full tokenized PTB validation set. 
""" from datasets import load_dataset - traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') - testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + traindata = load_dataset('ptb_text_only', + 'penn_treebank', + split='train', + trust_remote_code=True) + testdata = load_dataset('ptb_text_only', + 'penn_treebank', + split='test', + trust_remote_code=True) trainenc = tokenizer(' '.join(traindata['sentence']), return_tensors='pt') testenc = tokenizer(' '.join(testdata['sentence']), return_tensors='pt') @@ -195,12 +213,14 @@ def get_c4_new(tokenizer, nsamples, seed, seqlen): 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, - split='train') + split='train', + trust_remote_code=True) valdata = load_dataset( 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, - split='validation') + split='validation', + trust_remote_code=True) import random random.seed(seed) @@ -248,7 +268,8 @@ def get_pileval(tokenizer, nsamples, seed, seqlen=512): from datasets.builder import DatasetGenerationError try: dataset = load_dataset('mit-han-lab/pile-val-backup', - split='validation') + split='validation', + trust_remote_code=True) except DatasetGenerationError: raise InterruptedError('There have been some issues when generating ' 'the dataset, you could try to download it ' diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 865c2249de..4d3c97d718 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -23,6 +23,8 @@ class GenerationConfig: input message. **Only 1** is supported now. max_new_tokens (int): The maximum number of tokens that can be generated in the chat completion + do_sample (bool): Whether or not to use sampling, use greedy + decoding otherwise. Default to be False. top_p (float): An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass @@ -36,51 +38,44 @@ class GenerationConfig: random_seed (int): Seed used when sampling a token stop_words (List[str]): Words that stop generating further tokens bad_words (List[str]): Words that the engine will never generate + stop_token_ids (List[int]): List of tokens that stop the generation + when they are generated. The returned output will not contain + the stop tokens. + bad_token_ids (List[str]): List of tokens that the engine will never + generate. min_new_tokens (int): The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt. skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. logprobs (int): Number of log probabilities to return per output token. - """ + response_format (Dict): Only pytorch backend support formatting + response. Examples: `{"type": "json_schema", "json_schema": {"name":"test","schema": {"properties": {"name": {"type": "string"}}, "required": ["name"], "type": "object"}}}` + or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}` + logits_processors (List[Callable]): Custom logit processors. 
+ """ # noqa n: int = 1 max_new_tokens: int = 512 + do_sample: bool = False top_p: float = 1.0 - top_k: int = 1 + top_k: int = 50 temperature: float = 0.8 repetition_penalty: float = 1.0 ignore_eos: bool = False random_seed: int = None stop_words: List[str] = None bad_words: List[str] = None + stop_token_ids: List[int] = None + bad_token_ids: List[int] = None min_new_tokens: int = None skip_special_tokens: bool = True logprobs: int = None + response_format: Optional[Dict] = None logits_processors: Optional[List[LogitsProcessor]] = None - -@dataclass -class EngineGenerationConfig(GenerationConfig): - """generation parameter used by the inference engines.""" - stop_words: List[int] = None - bad_words: List[int] = None - - @staticmethod - def From(gen_config: GenerationConfig, tokenizer: Tokenizer): - """convert `GenerationConfig` to `EngineGenerationConfig` - Args: - gen_config (GenerationConfig): an instance of class `GenerationConfig` - tokenizer (Tokenizer): a tokenizer to encode the `stop_words` and `bad_words` in `gen_config` - - Returns: - EngineGenerationConfig: the generation config used by inference engines - - Examples: - >>> from lmdeploy import Tokenizer, GenerationConfig, EngineGenerationConfig - >>> tokenizer = Tokenizer('internlm/internlm-chat-7b') - >>> gen_config = GenerationConfig(stop_words=['']) - >>> gen_config = EngineGenerationConfig.From(gen_config, tokenizer) - """ # noqa E501 + def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer): + """convert stop_words/bad_sords to ids and append the ids to + stop_token_ids/bad_token_ids.""" def special_word_token_ids(words): if words is not None: @@ -93,21 +88,12 @@ def special_word_token_ids(words): return indexes return None - return EngineGenerationConfig( - n=gen_config.n, - logprobs=gen_config.logprobs, - max_new_tokens=gen_config.max_new_tokens, - min_new_tokens=gen_config.min_new_tokens, - top_p=gen_config.top_p, - top_k=gen_config.top_k, - temperature=gen_config.temperature, - repetition_penalty=gen_config.repetition_penalty, - ignore_eos=gen_config.ignore_eos, - random_seed=gen_config.random_seed, - skip_special_tokens=gen_config.skip_special_tokens, - stop_words=special_word_token_ids(gen_config.stop_words), - bad_words=special_word_token_ids(gen_config.bad_words), - logits_processors=gen_config.logits_processors) + stop_token_ids = special_word_token_ids(self.stop_words) or [] + bad_token_ids = special_word_token_ids(self.bad_words) or [] + stop_token_ids.extend(self.stop_token_ids or []) + bad_token_ids.extend(self.bad_token_ids or []) + self.stop_token_ids = list(set(stop_token_ids)) or None + self.bad_token_ids = list(set(bad_token_ids)) or None def __post_init__(self): """Check input validation.""" @@ -123,18 +109,21 @@ class TurbomindEngineConfig: """TurboMind Engine config. Args: - model_format (str): the layout of the deployed model. It can be one of the following values [hf, meta_llama, awq], - `hf` meaning huggingface model(.bin, .safetensors), `meta_llama` being meta llama's format(.pth), awq` meaning the quantized model by AWQ. + model_format (str): the layout of the deployed model. It can be one of the following values [hf, meta_llama, awq, gptq], + `hf` meaning huggingface model(.bin, .safetensors), `meta_llama` being meta llama's format(.pth), + `awq` and `gptq` meaning the quantized model by AWQ and GPTQ, respectively. + If it is not specified, i.e. 
None, it will be extracted from the input model tp (int): the number of GPU cards used in tensor parallelism, default to 1 session_len (int): the max session length of a sequence, default to None max_batch_size (int): the max batch size during inference, default to 128 cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache. For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it defaults to 0.5, depicting the percentage of TOTAL GPU memory to be allocated to the k/v cache. For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8, signifying the percentage of FREE GPU memory to be reserved for the k/v cache + cache_chunk_size (int): The policy to apply for KV block from the block manager, default to -1. cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64 enable_prefix_caching (bool): enable cache prompts for block reuse, default to False quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4 - rope_scaling_factor (int): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention + rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention use_logn_attn (bool): whether or not to use log attn: default to False download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface. revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. @@ -148,6 +137,7 @@ class TurbomindEngineConfig: session_len: Optional[int] = None max_batch_size: int = 128 cache_max_entry_count: float = 0.8 + cache_chunk_size: int = -1 cache_block_seq_len: int = 64 enable_prefix_caching: bool = False quant_policy: int = 0 diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py index a9b1390aa2..2b5ee85edc 100644 --- a/lmdeploy/pytorch/chat.py +++ b/lmdeploy/pytorch/chat.py @@ -5,7 +5,7 @@ from typing import List, Optional from lmdeploy.archs import get_model_arch -from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.async_engine import get_names_from_model from lmdeploy.tokenizer import DetokenizeState, Tokenizer @@ -52,7 +52,7 @@ def _stop_words(stop_words: List[str], tokenizer: Tokenizer): def run_chat(model_path: str, engine_config: PytorchEngineConfig, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, session_id: int = 1, trust_remote_code: bool = True, chat_template_config: Optional[ChatTemplateConfig] = None): @@ -62,7 +62,7 @@ def run_chat(model_path: str, Args: model_path (str): the huggingface model path. engine_config (PytorchEngineConfig): Config of engine. - gen_config (EngineGenerationConfig): Config of generation. + gen_config (GenerationConfig): Config of generation. session_id (int): the identical id of a session. trust_remote_code (bool): trust remote code. 
""" @@ -77,7 +77,7 @@ def run_chat(model_path: str, adapter_name = next(iter(engine_config.adapters.keys())) if gen_config is None: - gen_config = EngineGenerationConfig() + gen_config = GenerationConfig() nth_round = 1 step = 0 @@ -113,7 +113,7 @@ def run_chat(model_path: str, print(f'{prompt}', end='', flush=True) state = DetokenizeState(len(input_ids)) gen_config.random_seed = seed - gen_config.stop_words = stop_words + gen_config.stop_token_ids = stop_words for outputs in generator.stream_infer(session_id=session_id, input_ids=input_ids, gen_config=gen_config, @@ -162,12 +162,12 @@ def main(model_path: str, if adapter is not None: adapters = dict(default=adapter) engine_config = PytorchEngineConfig(tp=tp, adapters=adapters) - gen_config = EngineGenerationConfig(max_new_tokens=512, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - ignore_eos=False) + gen_config = GenerationConfig(max_new_tokens=512, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + ignore_eos=False) chat_template_config = None if chat_template is not None and os.path.exists(chat_template): chat_template_config = ChatTemplateConfig.from_json(chat_template) diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py index 1f14ac92b9..6470e09ed3 100644 --- a/lmdeploy/pytorch/check_env/__init__.py +++ b/lmdeploy/pytorch/check_env/__init__.py @@ -23,14 +23,19 @@ def _handle_exception(e: Exception, def check_env_deeplink(device_type: str): - """check Deeplink environment if specific device_type is set.""" + """check Deeplink environment.""" + try_import_deeplink(device_type) + + +def try_import_deeplink(device_type: str): + """import dlinfer if specific device_type is set.""" deeplink_device_type_list = [ 'ascend', ] if device_type in deeplink_device_type_list: logger = get_logger('lmdeploy') try: - import deeplink_ext # noqa: F401 + import dlinfer.framework.lmdeploy_ext # noqa: F401 except Exception as e: _handle_exception(e, 'PyTorch', logger) diff --git a/lmdeploy/pytorch/engine/devices/ascend.py b/lmdeploy/pytorch/engine/devices/ascend.py index a09fa5f655..9c782a3f3d 100644 --- a/lmdeploy/pytorch/engine/devices/ascend.py +++ b/lmdeploy/pytorch/engine/devices/ascend.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import torch -from .dipu import DIPUDeviceUtils +from .base_device_utils import BaseDeviceUtils -class ASCENDDeviceUtils(DIPUDeviceUtils): +class ASCENDDeviceUtils(BaseDeviceUtils): device = 'ascend' @@ -17,7 +17,8 @@ def update_step_context(cls, step_context): single_attention_mask = torch.logical_not( torch.tril( torch.ones(step_context.q_seq_length[i], - step_context.kv_seq_length[i], + step_context.block_offsets.shape[1] * + block_size, dtype=torch.bool).cuda(), diagonal=step_context.kv_seq_length[i] - step_context.q_seq_length[i], @@ -28,7 +29,7 @@ def update_step_context(cls, step_context): block_loc = step_context.block_offsets[i][block_idx] token_loc = history_length % block_size for _ in range(step_context.q_seq_length[i]): - kv_start_indices.append(block_loc * block_size + token_loc) + kv_start_indices.append([block_loc * block_size + token_loc]) if _ == step_context.q_seq_length[i] - 1: break token_loc = (token_loc + 1) % block_size @@ -38,4 +39,11 @@ def update_step_context(cls, step_context): kv_start_indices, device=step_context.block_offsets.device) setattr(step_context, 'kv_start_indices', kv_start_indices) setattr(step_context, 'attention_mask', attention_mask) + setattr(step_context, 'q_start_loc', step_context.q_start_loc.cpu()) + setattr(step_context, 'q_seq_length', step_context.q_seq_length.cpu()) + setattr(step_context, 'kv_seq_length', + step_context.kv_seq_length.cpu()) + is_unpaged_prefill = (not step_context.is_decoding) and all( + (step_context.q_seq_length == step_context.kv_seq_length).tolist()) + setattr(step_context, 'is_unpaged_prefill', is_unpaged_prefill) return step_context diff --git a/lmdeploy/pytorch/engine/devices/dipu.py b/lmdeploy/pytorch/engine/devices/dipu.py deleted file mode 100644 index d2cc9c4243..0000000000 --- a/lmdeploy/pytorch/engine/devices/dipu.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .base_device_utils import BaseDeviceUtils - - -class DIPUDeviceUtils(BaseDeviceUtils): - - device = 'dipu' - - @classmethod - def update_step_context(cls, step_context): - """update step context.""" - raise NotImplementedError('`update_step_context` of ' - f'<{cls}> not implemented.') diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index d37ab1b8c7..4a8d02cd7d 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -7,7 +7,7 @@ import numpy as np import torch -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, ResponseType) from lmdeploy.utils import get_logger, get_model, logging_timer @@ -500,6 +500,7 @@ def _batch_stopping_criteria(self, token_ids: torch.Tensor, @logging_timer('SamplingLogits', logger) def async_sampling_logits(self, logits: torch.Tensor, all_ids: torch.Tensor, + guided_input_ids: torch.Tensor, sampling_inputs: SamplingInputs, inputs: ModelInputs, ignore_eos: torch.Tensor): """sampling logits.""" @@ -514,8 +515,9 @@ def __get_last_logits(): return logits[last_idx, :] split_logits = __get_last_logits().cuda() - logits_processor = FusedLogitsProcessor(sampling_inputs, ignore_eos) - logits = logits_processor(split_logits, all_ids) + logits_processor = FusedLogitsProcessor(sampling_inputs, ignore_eos, + self.tokenizer.model.model) + logits = logits_processor(all_ids, guided_input_ids, split_logits) next_token_ids = logits_processor.sampling(logits) return next_token_ids @@ -679,7 +681,8 @@ def __get_q_start_loc(): async def _async_step_background( self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict, - all_ids: torch.Tensor, sampling_inputs: SamplingInputs, + all_ids: torch.Tensor, guided_input_ids: torch.Tensor, + sampling_inputs: SamplingInputs, num_appendable_ids: torch.LongTensor, num_ignore_eos: torch.LongTensor, return_logits: bool, output_que: asyncio.Queue): @@ -687,11 +690,16 @@ async def _async_step_background( def __update_inputs(next_token_ids): """update inputs.""" - nonlocal all_ids + nonlocal all_ids, guided_input_ids inputs.update(next_token_ids) if all_ids is not None: all_ids = torch.cat( [all_ids, next_token_ids[:, None].to(all_ids.device)], 1) + if guided_input_ids is not None: + guided_input_ids = torch.cat([ + guided_input_ids, next_token_ids[:, None].to( + guided_input_ids.device) + ], 1) if sampling_inputs.random_offsets is not None: sampling_inputs.random_offsets += 1 @@ -701,6 +709,8 @@ def __update_inputs(next_token_ids): is_decoding = inputs.is_decoding if all_ids is not None: all_ids = all_ids.cuda() + if guided_input_ids is not None: + guided_input_ids = guided_input_ids.cuda() sampling_inputs = sampling_inputs.to_device('cuda') num_appendable_ids = num_appendable_ids.cuda() num_ignore_eos = num_ignore_eos.cuda() @@ -720,7 +730,8 @@ def __update_inputs(next_token_ids): # sampling next_token_ids = self.async_sampling_logits( - logits, all_ids, sampling_inputs, inputs, num_ignore_eos > 0) + logits, all_ids, guided_input_ids, sampling_inputs, inputs, + num_ignore_eos > 0) num_ignore_eos = num_ignore_eos - 1 # stopping criteria @@ -766,6 +777,24 @@ def __gather_all_ids(seqs: SeqList, sampling_inputs: SamplingInputs): output[idx, -h_len:] = h_ids return output + def __gather_guided_input_ids(seqs: SeqList, + sampling_inputs: SamplingInputs): + """gather input ids for guided decode.""" + if not any(sampling_inputs.response_formats or ()): + return None + batch = 
len(seqs) + max_len = max(seq.num_new_tokens for seq in seqs) + pad_id = self.model_config.bos_token_id + pad_id = 0 if pad_id is None else pad_id + output = torch.full((batch, max_len), pad_id, dtype=torch.int64) + for idx, seq in enumerate(seqs): + h_len = seq.num_new_tokens + if h_len == 0: + continue + h_ids = torch.from_numpy(seq.all_ids[-seq.num_new_tokens:]) + output[idx, -h_len:] = h_ids + return output + def __get_num_appendable_ids(seqs: SeqList): """get num appendable ids.""" ret = [ @@ -802,6 +831,8 @@ def __need_logits(seqs: SeqList): is_prefill) sampling_inputs = SamplingInputs.from_sampling_params(running) all_ids = __gather_all_ids(running, sampling_inputs) + guided_input_ids = __gather_guided_input_ids( + running, sampling_inputs) num_appendable_ids = __get_num_appendable_ids(running) num_ignore_eos = __get_num_ignore_eos(running) return_logits = __need_logits(running) @@ -814,6 +845,7 @@ def __need_logits(seqs: SeqList): swap_in_map=schedule_output.swap_in_map, swap_out_map=schedule_output.swap_out_map, all_ids=all_ids, + guided_input_ids=guided_input_ids, sampling_inputs=sampling_inputs, num_appendable_ids=num_appendable_ids, num_ignore_eos=num_ignore_eos, @@ -912,7 +944,7 @@ async def async_batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, @@ -922,7 +954,7 @@ async def async_batched_infer( Args: session_ids (List[int]): The session id. token_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_names (List[str]): The name of the adapters. keep_cache (bool): Keep kv cache after infer. @@ -944,7 +976,7 @@ def batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 9d9ebf9198..4758585eca 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import List -from lmdeploy.messages import EngineGenerationConfig, EngineOutput +from lmdeploy.messages import EngineOutput, GenerationConfig from lmdeploy.utils import get_logger from ..messages import (InputEmbeddingRangeType, InputEmbeddings, @@ -129,7 +129,7 @@ async def async_stream_infer( self, session_id: int, input_ids: List[int], - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_name: str = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, @@ -139,7 +139,7 @@ async def async_stream_infer( Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_name (str): The lora adapter name. 
Yields: @@ -150,7 +150,7 @@ async def async_stream_infer( if len(input_ids) > self.max_input_len: yield EngineOutput(ResponseType.INPUT_LENGTH_ERROR, [], 0) return - gen_config = gen_config or EngineGenerationConfig() + gen_config = gen_config or GenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) await self.req_sender.async_send_async( RequestType.ADD_SESSION, dict(session_id=session_id, @@ -191,7 +191,7 @@ async def async_infer( self, session_id: int, input_ids: List[int] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, **kwargs): @@ -200,7 +200,7 @@ async def async_infer( Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. Returns: int: Error flags. 0 if success. @@ -225,7 +225,7 @@ async def async_infer( def stream_infer(self, session_id: int, input_ids: List[int], - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_name: str = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, @@ -235,7 +235,7 @@ def stream_infer(self, Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_name (str): The lora adapter name. Yields: @@ -268,7 +268,7 @@ def __call_async(): yield from __call_async() return - gen_config = gen_config or EngineGenerationConfig() + gen_config = gen_config or GenerationConfig() sampling_param = SamplingParam.from_gen_config(gen_config=gen_config) self.req_sender.send_async(RequestType.ADD_SESSION, dict(session_id=session_id, response=False)) @@ -308,7 +308,7 @@ def __call_async(): def infer(self, session_id: int, input_ids: List[int] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, input_embeddings: InputEmbeddingType = None, input_embedding_ranges: InputEmbeddingRangeType = None, **kwargs): @@ -317,7 +317,7 @@ def infer(self, Args: session_id (int): The session id. input_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. Returns: int: Error flags. 0 if success. @@ -343,7 +343,7 @@ async def async_batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, @@ -354,7 +354,7 @@ async def async_batched_infer( Args: session_ids (List[int]): The session id. token_ids (List[int]): The input token ids. - gen_config (EngineGenerationConfig): The sampling parameters. + gen_config (GenerationConfig): The sampling parameters. adapter_names (List[str]): The name of the adapters. keep_cache (bool): Keep kv cache after infer. 
@@ -443,7 +443,7 @@ def batched_infer( self, session_ids: List[int], token_ids: List[List[int]] = None, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, adapter_names: List[str] = None, keep_cache: bool = False, input_embeddings: List[InputEmbeddingType] = None, diff --git a/lmdeploy/pytorch/engine/guided_process.py b/lmdeploy/pytorch/engine/guided_process.py new file mode 100644 index 0000000000..444a809c81 --- /dev/null +++ b/lmdeploy/pytorch/engine/guided_process.py @@ -0,0 +1,170 @@ +# Copyright 2024- the Outlines developers +# This file is adapted from +# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +import copy +import math +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import defaultdict +from functools import lru_cache +from typing import DefaultDict, Dict, List, Union + +import torch +from outlines.fsm.guide import CFGGuide, Generate, RegexGuide, Write +from outlines.fsm.json_schema import build_regex_from_schema +from pydantic import BaseModel +from transformers import PreTrainedTokenizerBase + + +class BaseLogitsProcessor: + + def init_state(self): + """Initialize the FSM states.""" + self.fsm_state: DefaultDict[int, int] = defaultdict(int) + + def __call__(self, input_ids: List[int], + scores: torch.Tensor) -> torch.Tensor: + """Use the FSM to bias the logits before sampling the next token.""" + + seq_id = hash(tuple(input_ids)) + + if len(input_ids) == 0: + self.init_state() + else: + last_token = input_ids[-1] + last_seq_id = hash(tuple(input_ids[:-1])) + self.fsm_state[seq_id] = self.fsm.get_next_state( + state=self.fsm_state[last_seq_id], token_id=last_token) + + instruction = self.fsm.get_next_instruction(self.fsm_state[seq_id]) + + if type(instruction) == Generate: + allowed_tokens = instruction.tokens + elif type(instruction) == Write: + # TODO: support fast forward tokens + allowed_tokens = [instruction.tokens[0]] + else: + raise TypeError( + f'Unsupported instruction type {type(instruction)}') + + mask = torch.full((scores.shape[-1], ), + -math.inf, + device=scores.device) + mask[allowed_tokens] = 0 + scores.add_(mask) + + return scores + + def adapt_tokenizer(self, tokenizer): + """Adapt tokenizer to use to compile the FSM. + + The API of Outlines tokenizers is slightly different to that of + `transformers`. In addition we need to handle the missing spaces to + Llama's tokenizer to be able to compile FSMs for this model. + """ + from outlines.integrations.utils import adapt_tokenizer + tokenizer = adapt_tokenizer(tokenizer) + # vocab size greater than logits shape because of '[UNUSED_TOKEN_...]' + if hasattr(tokenizer, '_tokenizer'): + tokenizer.vocabulary = tokenizer._tokenizer.get_vocab( + with_added_tokens=False) + return tokenizer + + +class RegexLogitsProcessor(BaseLogitsProcessor): + + def __init__(self, regex_string: str, tokenizer): + """Compile the FSM that drives the regex-structured generation. 
+ + Args: + regex_string: A string that represents a regular expression + tokenizer: The model's tokenizer + """ + tokenizer = self.adapt_tokenizer(copy.deepcopy(tokenizer)) + fsm = RegexGuide(regex_string, tokenizer) + self.fsm = fsm + + +class JSONLogitsProcessor(RegexLogitsProcessor): + + def __init__(self, schema: Union[str, Dict, BaseModel], tokenizer): + """Compile the FSM that drives the JSON-guided generation. + + Args: + schema: A str schema that encodes the structure we want the model + to generate + tokenizer: The model's tokenizer + """ + regex_string = build_regex_from_schema(schema) + super().__init__(regex_string, tokenizer) + + +class CFGLogitsProcessor(BaseLogitsProcessor): + + def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase): + """Compile the FSM that drives the context free grammar generation. + + Parameters + ---------- + cfg + A string that represents a context-free grammar + tokenizer + The model's tokenizer + """ + tokenizer = self.adapt_tokenizer(tokenizer) + fsm = CFGGuide(cfg, tokenizer) + self.fsm = fsm + + +# copied from https://github.com/vllm-project/vllm/blob/a7f65c2be93f491771aca31106f790bf381c0bad/vllm/model_executor/guided_decoding/outlines_decoding.py#L31 # noqa +JSON_GRAMMAR = r""" +?start: object | array + +?value: object +| array +| UNESCAPED_STRING +| SIGNED_NUMBER -> number +| "true" -> true +| "false" -> false +| "null" -> null + +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : UNESCAPED_STRING ":" value + +%import common.UNESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS + +%ignore WS +""" + + +@lru_cache(maxsize=32) +def _get_guided_logits_processor(guide: str, + tokenizer: PreTrainedTokenizerBase, + type: str): + try: + if type == 'json_object': + return CFGLogitsProcessor(guide, tokenizer) + elif type == 'json_schema': + return JSONLogitsProcessor(guide, tokenizer) + elif type == 'regex_schema': + return RegexLogitsProcessor(guide, tokenizer) + else: + return None + except Exception as e: + from lmdeploy.utils import get_logger + logger = get_logger('lmdeploy') + logger.error(e) + return None diff --git a/lmdeploy/pytorch/engine/logits_process.py b/lmdeploy/pytorch/engine/logits_process.py index 52f99afa35..83c33faaf9 100644 --- a/lmdeploy/pytorch/engine/logits_process.py +++ b/lmdeploy/pytorch/engine/logits_process.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
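As an aside on the new `guided_process.py` module above: a hedged sketch of how the cached factory and the processors are meant to be used. The tokenizer name is an assumption, `outlines` must be installed, and `_get_guided_logits_processor` returns `None` when FSM compilation fails, so the result should be checked.

import torch
from transformers import AutoTokenizer

from lmdeploy.pytorch.engine.guided_process import (JSON_GRAMMAR,
                                                    _get_guided_logits_processor)

# illustrative tokenizer; any HF tokenizer that outlines can adapt will do
tokenizer = AutoTokenizer.from_pretrained('internlm/internlm2-chat-7b',
                                          trust_remote_code=True)

# regex-constrained decoding; the compiled FSM is cached via lru_cache
processor = _get_guided_logits_processor(r'call me [A-Za-z]{1,10}',
                                         tokenizer, 'regex_schema')

scores = torch.randn(len(tokenizer))     # illustrative next-token logits
if processor is not None:
    scores = processor([], scores)       # disallowed tokens are pushed to -inf

# free-form JSON without a schema falls back to the Lark grammar above
json_processor = _get_guided_logits_processor(JSON_GRAMMAR, tokenizer,
                                              'json_object')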
+import json from dataclasses import asdict, dataclass -from typing import List +from typing import Dict, List, Optional, Tuple import torch from transformers.generation.logits_process import LogitsWarper from lmdeploy.messages import LogitsProcessor +from lmdeploy.tokenizer import Tokenizer from ..messages import SchedulerSequence @@ -95,6 +97,40 @@ def _multinomial_sampling(scores: torch.Tensor, return multinomial_sampling(scores, seeds, offsets, indices) +def _guided_sampling(response_formats: Tuple[Dict], scores: torch.Tensor, + guided_input_ids: Optional[torch.Tensor], + tokenizer: object): + if guided_input_ids is None: + return scores + for i in range(len(response_formats)): + _format = response_formats[i] + if isinstance(_format, Dict) and _format.get('type', 'text') != 'text': + if _format['type'] == 'json_schema': + schema = _format['json_schema'] + if isinstance(schema, Dict): + for key in ['json_schema', 'schema']: + if key in schema: + schema = json.dumps(schema[key]) + elif schema is None: + from .guided_process import JSON_GRAMMAR + schema = JSON_GRAMMAR + elif isinstance(schema, str): + raise ValueError( + f'Cannot parse schema {schema}. The schema must be ' + 'either a dictionary or a string that contains the' + ' JSON Schema specification') + elif _format['type'] == 'regex_schema': + schema = _format.get('regex_schema', '') + else: + raise ValueError(f"unsupported format type: {_format['type']}") + from .guided_process import _get_guided_logits_processor + processor = _get_guided_logits_processor(schema, tokenizer, + _format['type']) + if processor: + scores[i] = processor(guided_input_ids[i].tolist(), scores[i]) + return scores + + @dataclass class SamplingInputs: temperature: torch.Tensor = None @@ -107,6 +143,7 @@ class SamplingInputs: random_offsets: int = None max_top_k: int = 1 min_top_p: float = 1.0 + response_formats: Tuple[str] = () logits_processors: List[List[LogitsProcessor]] = None @classmethod @@ -121,6 +158,7 @@ def from_sampling_params(cls, seqs: List[SchedulerSequence]): stop_words = [None] * batch_size random_seeds = [torch.seed() & 0xffffffff] * batch_size random_offsets = [None] * batch_size + response_formats = [None] * batch_size logits_processors = [None] * batch_size def __gather_params(): @@ -132,6 +170,7 @@ def __gather_params(): top_k[idx] = param.top_k top_p[idx] = param.top_p random_offsets[idx] = seq.random_offsets + response_formats[idx] = param.response_format if param.random_seed is not None: random_seeds[idx] = param.random_seed & 0xffffffff @@ -204,6 +243,7 @@ def __get_bad_words(bad_words): top_p=top_p, random_seeds=random_seeds, random_offsets=random_offsets, + response_formats=tuple(response_formats), max_top_k=max_top_k, min_top_p=min_top_p, logits_processors=logits_processors, @@ -235,21 +275,26 @@ def _apply_custom_logits_processors(batched_logits_processors, all_ids, class FusedLogitsProcessor(LogitsWarper): """Custom logits processor.""" - def __init__(self, sampling_inputs: SamplingInputs, - ignore_eos: torch.Tensor): + def __init__(self, + sampling_inputs: SamplingInputs, + ignore_eos: torch.Tensor, + tokenizer: Optional[Tokenizer] = None): self.sampling_inputs: SamplingInputs = sampling_inputs self.ignore_eos = ignore_eos + self.tokenizer = tokenizer - def __call__(self, scores: torch.FloatTensor, - all_ids: torch.LongTensor) -> torch.FloatTensor: + def __call__(self, all_ids: torch.LongTensor, + guided_input_ids: torch.LongTensor, + scores: torch.FloatTensor) -> torch.FloatTensor: r""" Args: + all_ids (torch.LongTensor): All 
the token ids. + guided_input_ids (torch.LongTensor): Guided prompt ids. scores (torch.FloatTensor): Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam search or log softmax for each vocabulary token when using beam search - all_ids (torch.LongTensor): All the token ids. Return: @@ -282,6 +327,8 @@ def __call__(self, scores: torch.FloatTensor, stop_words = torch.where(self.ignore_eos[:, None], stop_words, -1) scores = _process_bad_words(scores, stop_words) + scores = _guided_sampling(sampling_inputs.response_formats, scores, + guided_input_ids, self.tokenizer) return scores def sampling(self, logits: torch.Tensor): diff --git a/lmdeploy/pytorch/kernels/ascend/__init__.py b/lmdeploy/pytorch/kernels/ascend/__init__.py index bd207a1ecb..8ab92e0158 100644 --- a/lmdeploy/pytorch/kernels/ascend/__init__.py +++ b/lmdeploy/pytorch/kernels/ascend/__init__.py @@ -1,6 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -from ..dipu import (apply_rotary_pos_emb, fill_kv_cache, fused_rotary_emb, - multinomial_sampling, paged_attention_fwd, rms_norm) +from ..default import multinomial_sampling +from .apply_rotary_pos_emb import apply_rotary_pos_emb +from .fill_kv_cache import fill_kv_cache +from .fused_rotary_emb import fused_rotary_emb +from .moe_gating_topk_softmax import moe_gating_topk_softmax +from .pagedattention import paged_attention_fwd +from .rms_norm import rms_norm __all__ = [ 'rms_norm', @@ -8,5 +13,6 @@ 'fused_rotary_emb', 'fill_kv_cache', 'paged_attention_fwd', + 'moe_gating_topk_softmax', 'multinomial_sampling', ] diff --git a/lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py new file mode 100644 index 0000000000..4a4039c44d --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
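Looking back at `_guided_sampling` above: each element of `response_formats` is the per-request dict derived from `GenerationConfig.response_format`. A hedged sketch of the shapes it accepts (the schema itself is illustrative):

# hypothetical per-sequence entries as consumed by _guided_sampling
json_format = {
    'type': 'json_schema',
    'json_schema': {'schema': {'type': 'object',
                               'properties': {'name': {'type': 'string'}},
                               'required': ['name']}},
}
regex_format = {'type': 'regex_schema',
                'regex_schema': r'call me [A-Za-z]{1,10}'}
plain_format = {'type': 'text'}   # no guiding; logits pass through untouched
response_formats = (json_format, regex_format, plain_format)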
+import dlinfer.ops as ext_ops +from torch import Tensor + + +def apply_rotary_pos_emb( + query_states: Tensor, + key_states: Tensor, + cos: Tensor, + sin: Tensor, + position_ids: Tensor, + position_ids_1d: Tensor, + q_embed=None, + k_embed=None, + context=None, +): + bs, head, dim = query_states.shape + num_kv_heads = key_states.shape[1] + query_states_reshaped = query_states.reshape(1, bs, head, dim) + key_states_reshaped = key_states.reshape(1, bs, num_kv_heads, dim) + if not (hasattr(context, 'cos') or hasattr(context, 'sin')): + if len(cos.shape) == 3 and len(sin.shape) == 3: + cos = cos[:, position_ids_1d].view(1, bs, 1, -1) + sin = sin[:, position_ids_1d].view(1, bs, 1, -1) + elif len(cos.shape) == 2 and len(sin.shape) == 2: + cos = cos[position_ids_1d].view(1, bs, 1, -1) + sin = sin[position_ids_1d].view(1, bs, 1, -1) + else: + raise RuntimeError('Cannot handle cos/sin shape dims!') + + if context: + setattr(context, 'cos', cos) + setattr(context, 'sin', sin) + cached_cos = context.cos if context else cos + cached_sin = context.sin if context else sin + query_states, key_states = ext_ops.apply_rotary_pos_emb( + query_states_reshaped, key_states_reshaped, cached_cos, cached_sin, + None, None) + query_states = query_states.view(bs, head, dim) + key_states = key_states.view(bs, num_kv_heads, dim) + if q_embed is None: + q_embed = query_states + else: + q_embed.copy_(query_states) + if k_embed is None: + k_embed = key_states + else: + k_embed.copy_(key_states) + return q_embed, k_embed diff --git a/lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py b/lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py new file mode 100644 index 0000000000..333e500532 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import dlinfer.ops as ext_ops +from torch import Tensor + + +def fill_kv_cache( + key_states: Tensor, + value_states: Tensor, + key_caches: Tensor, + value_caches: Tensor, + q_start_loc: Tensor, + q_seq_length: Tensor, + kv_seq_length: Tensor, + max_q_seq_length: int, + block_offsets: Tensor, + context: None, +): + """fill key/value state to cache for paged attention.""" + ext_ops.fill_kv_cache(key_states, value_states, key_caches, value_caches, + context.kv_start_indices) diff --git a/lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py b/lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py new file mode 100644 index 0000000000..03fa2910af --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
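For reference on the rotary kernels being moved from dipu to ascend here: they are expected to match the standard rotary position embedding. A plain-PyTorch sketch of that formula, as a semantic reference only (layout and half-rotation conventions differ between the fused backends):

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_ref(q: torch.Tensor, k: torch.Tensor,
                     cos: torch.Tensor, sin: torch.Tensor):
    # q, k: [..., dim]; cos/sin broadcast over the leading dims
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin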
+import dlinfer.ops as ext_ops +import torch +from torch import Tensor + + +def fused_rotary_emb( + query_states: Tensor, + key_states: Tensor, + position_ids: torch.LongTensor, + inv_freq: Tensor, + scaling_factor: float, + out_q: Tensor = None, + out_k: Tensor = None, + context=None, +): + batch, seqlen, head, dim = query_states.shape + num_kv_heads = key_states.shape[-2] + query_states_reshaped = query_states.view(batch, seqlen, head, dim) + key_states_reshaped = key_states.view(batch, seqlen, num_kv_heads, dim) + position_ids = position_ids.squeeze(0).unsqueeze(-1) + pos_freq = position_ids / scaling_factor * inv_freq + if not (hasattr(context, 'cos') or hasattr(context, 'sin')): + cos = (torch.cos(pos_freq).view(batch, seqlen, 1, + -1).repeat(1, 1, 1, + 2).to(query_states.dtype)) + sin = (torch.sin(pos_freq).view(batch, seqlen, 1, + -1).repeat(1, 1, 1, + 2).to(query_states.dtype)) + if context: + setattr(context, 'cos', cos) + setattr(context, 'sin', sin) + cached_cos = context.cos if context else cos + cached_sin = context.sin if context else sin + ext_ops.apply_rotary_pos_emb(query_states_reshaped, key_states_reshaped, + cached_cos, cached_sin, None, None) + if out_q is None: + out_q = query_states + else: + out_q.copy_(query_states) + if out_k is None: + out_k = key_states + else: + out_k.copy_(key_states) + return out_q, out_k diff --git a/lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py new file mode 100644 index 0000000000..87b5ad1b39 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import dlinfer.ops as ext_ops +import torch +from torch import Tensor + + +def moe_gating_topk_softmax(router_logits: Tensor, topk: int): + routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax( + router_logits, topk) + return routing_weights.to(torch.float32), selected_experts.to(torch.int64) diff --git a/lmdeploy/pytorch/kernels/ascend/pagedattention.py b/lmdeploy/pytorch/kernels/ascend/pagedattention.py new file mode 100644 index 0000000000..aa2609e476 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/pagedattention.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
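The new `moe_gating_topk_softmax` wrapper above hands off to a fused dlinfer op. A hedged plain-PyTorch sketch of the gating it is expected to implement (whether the fused op renormalizes the selected top-k weights is backend-defined, so treat this as a shape and dtype reference):

import torch

def moe_gating_topk_softmax_ref(router_logits: torch.Tensor, topk: int):
    # router_logits: [num_tokens, num_experts]
    routing_weights = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1)
    return routing_weights, selected_experts.to(torch.int64)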
+import dlinfer.ops as ext_ops +import torch +from torch import Tensor + + +def prefill_attention( + query_states: Tensor, + key_states: Tensor, + value_states: Tensor, + attn_output: Tensor, + key_cache: Tensor, + value_cache: Tensor, + block_offsets: Tensor, + q_start_loc: Tensor, + q_seq_len: Tensor, + kv_seq_len: Tensor, + block_size: int, + kv_cache_len: int, + context=None, +): + num_q_heads, dim = query_states.shape[1:3] + num_kv_heads = value_states.shape[1] + + if context.is_unpaged_prefill: + ext_ops.prefill_attention( + query_states, + key_states, + value_states, + q_start_loc, + q_seq_len, + context.max_q_seq_length, + num_q_heads, + num_kv_heads, + attn_mask=context.attention_mask, + attn_output=attn_output, + ) + else: + key_cache = key_cache.reshape(1, kv_cache_len, num_kv_heads * dim) + value_cache = value_cache.reshape(1, kv_cache_len, num_kv_heads * dim) + ext_ops.paged_prefill_attention( + query_states, + key_cache, + value_cache, + block_offsets, + block_size, + q_start_loc, + q_seq_len, + kv_seq_len, + num_q_heads, + num_kv_heads, + attn_mask=context.attention_mask, + attn_output=attn_output, + ) + + +def paged_decode_attention(q, k_cache, v_cache, attn_output, kv_seq_len, + max_kv_seq_len, block_offsets, block_size): + num_kv_heads, num_q_heads = k_cache.shape[1], q.shape[1] + ext_ops.paged_decode_attention( + q, + k_cache, + v_cache, + block_offsets, + block_size, + kv_seq_len, + max_kv_seq_len, + num_q_heads, + num_kv_heads, + attn_output=attn_output.view(q.shape), + ) + + +def paged_attention_fwd( + query_states: Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + key_cache: Tensor, + value_cache: Tensor, + attn_output: Tensor, + block_offsets: Tensor, + q_start_loc: Tensor, + q_seqlens: Tensor, + kv_seqlens: Tensor, + max_seqlen: int, + window_size: int = 1, + context=None, +): + is_decoding = query_states.shape[-3] == q_seqlens.size(0) + block_num, block_size, head, dim = key_cache.size() + kv_cache_len = block_num * block_size + k = key_cache.reshape(block_num * block_size, head, dim) + v = value_cache.reshape(block_num * block_size, head, dim) + if not is_decoding: + prefill_attention( + query_states, + key_states, + value_states, + attn_output, + k, + v, + block_offsets, + q_start_loc, + q_seqlens, + kv_seqlens, + block_size, + kv_cache_len, + context=context, + ) + else: + paged_decode_attention( + query_states, + k, + v, + attn_output, + kv_seqlens, + context.max_kv_seq_length, + block_offsets, + block_size, + ) diff --git a/lmdeploy/pytorch/kernels/ascend/rms_norm.py b/lmdeploy/pytorch/kernels/ascend/rms_norm.py new file mode 100644 index 0000000000..57b2f26c21 --- /dev/null +++ b/lmdeploy/pytorch/kernels/ascend/rms_norm.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import dlinfer.ops as ext_ops +from torch import Tensor + + +def rms_norm(hidden_states: Tensor, + weight: Tensor, + eps: float = 1e-6, + out: Tensor = None): + rms_norm_out = ext_ops.rms_norm(hidden_states, weight, eps) + if out is None: + out = rms_norm_out + else: + out.copy_(rms_norm_out) + return out diff --git a/lmdeploy/pytorch/kernels/dipu/__init__.py b/lmdeploy/pytorch/kernels/dipu/__init__.py deleted file mode 100644 index 65ebc8cec1..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
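Similarly, the ascend `rms_norm` wrapper above mirrors the usual RMSNorm computation; a reference sketch in plain PyTorch, accumulating in fp32 for stability:

import torch

def rms_norm_ref(hidden_states: torch.Tensor, weight: torch.Tensor,
                 eps: float = 1e-6) -> torch.Tensor:
    variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
    out = hidden_states.float() * torch.rsqrt(variance + eps)
    return (weight.float() * out).to(hidden_states.dtype)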
-from ..default import multinomial_sampling -from .apply_rotary_pos_emb import apply_rotary_pos_emb -from .fill_kv_cache import fill_kv_cache -from .fused_rotary_emb import fused_rotary_emb -from .pagedattention import paged_attention_fwd -from .rms_norm import rms_norm - -__all__ = [ - 'rms_norm', - 'apply_rotary_pos_emb', - 'fused_rotary_emb', - 'fill_kv_cache', - 'paged_attention_fwd', - 'multinomial_sampling', -] diff --git a/lmdeploy/pytorch/kernels/dipu/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dipu/apply_rotary_pos_emb.py deleted file mode 100644 index 559cf8afba..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/apply_rotary_pos_emb.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -from torch import Tensor - - -def apply_rotary_pos_emb( - query_states: Tensor, - key_states: Tensor, - cos: Tensor, - sin: Tensor, - position_ids: Tensor, - position_ids_1d: Tensor, - q_embed=None, - k_embed=None, - context=None, -): - bs, head, dim = query_states.shape - numKeyValueHeads = key_states.shape[1] - query_states = query_states.reshape(bs, head * dim) - key_states = key_states.reshape(bs, numKeyValueHeads * dim) - if not (hasattr(context, 'cos') or hasattr(context, 'sin')): - if cos.dim() == 3: - cos = cos[:, position_ids_1d].view(1, bs, 1, -1) - sin = sin[:, position_ids_1d].view(1, bs, 1, -1) - elif cos.dim() == 2: - cos = cos[position_ids_1d].view(1, bs, 1, -1) - sin = sin[position_ids_1d].view(1, bs, 1, -1) - else: - raise RuntimeError(f'Unsupport cos dim: {cos.dim()}') - setattr(context, 'cos', cos) - setattr(context, 'sin', sin) - ext.rotary_embedding_v2(query_states, key_states, context.cos, context.sin, - dim) - return query_states.view(bs, head, - dim), key_states.view(bs, numKeyValueHeads, dim) diff --git a/lmdeploy/pytorch/kernels/dipu/fill_kv_cache.py b/lmdeploy/pytorch/kernels/dipu/fill_kv_cache.py deleted file mode 100644 index f51b851185..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/fill_kv_cache.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -from torch import Tensor - - -def fill_kv_cache( - key_states: Tensor, - value_states: Tensor, - key_caches: Tensor, - value_caches: Tensor, - q_start_loc: Tensor, - q_seq_length: Tensor, - kv_seq_length: Tensor, - max_q_seq_length: int, - block_offsets: Tensor, - context: None, -): - """fill key/value state to cache for paged attention.""" - dest_index_copy_kv(key_states, context.kv_start_indices, key_caches) - dest_index_copy_kv(value_states, context.kv_start_indices, value_caches) - - -def dest_index_copy_kv(states, dest_loc, caches): - block_num, block_size, head, dim = caches.size() - caches_tmp = caches.view(block_num * block_size, head, dim) - ext.dest_index_copy_kv(states, dest_loc, caches_tmp) - caches[:] = caches_tmp.view(block_num, block_size, head, dim) diff --git a/lmdeploy/pytorch/kernels/dipu/fused_rotary_emb.py b/lmdeploy/pytorch/kernels/dipu/fused_rotary_emb.py deleted file mode 100644 index 2a67a24516..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/fused_rotary_emb.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import deeplink_ext.cpp_extensions as ext -import torch -from torch import Tensor - - -def fused_rotary_emb( - query_states: Tensor, - key_states: Tensor, - position_ids: torch.LongTensor, - inv_freq: Tensor, - scaling_factor: float, - out_q: Tensor = None, - out_k: Tensor = None, - context=None, -): - _, bs, head, dim = query_states.shape - _, _, numKeyValueHeads, _ = key_states.shape - query_states = query_states.view(bs, head * dim) - key_states = key_states.view(bs, numKeyValueHeads * dim) - position_ids = position_ids.squeeze(0).unsqueeze(-1) - pos_freq = position_ids / scaling_factor * inv_freq - if not (hasattr(context, 'cos') or hasattr(context, 'sin')): - cos = (torch.cos(pos_freq).view(position_ids.shape[0], 1, - -1).repeat(1, 1, - 2).to(query_states.dtype)) - sin = (torch.sin(pos_freq).view(position_ids.shape[0], 1, - -1).repeat(1, 1, - 2).to(query_states.dtype)) - setattr(context, 'cos', cos) - setattr(context, 'sin', sin) - ext.rotary_embedding_v2(query_states, key_states, context.cos, context.sin, - dim) - query_states = query_states.view(1, bs, head, dim) - key_states = key_states.view(1, bs, numKeyValueHeads, dim) - return query_states, key_states diff --git a/lmdeploy/pytorch/kernels/dipu/pagedattention.py b/lmdeploy/pytorch/kernels/dipu/pagedattention.py deleted file mode 100644 index 9304ec0a35..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/pagedattention.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -import torch -from torch import Tensor - - -def flash_context_attention( - query_states: Tensor, - key_states: Tensor, - value_states: Tensor, - attn_output: Tensor, - key_cache: Tensor, - value_cache: Tensor, - block_offsets: Tensor, - q_start_loc: Tensor, - q_seqlens: list, - kv_seqlens: list, - block_size: int, - kv_cache_len: int, - context=None, -): - batch, head, dim = ( - q_start_loc.shape[0], - query_states.shape[1], - query_states.shape[2], - ) - numKeyValueHeads = value_states.shape[1] - assert key_states.shape[1] == value_states.shape[1] - for i in range(batch): - start = q_start_loc[i] - end = start + q_seqlens[i] - single_seqlen = int(end - start) - single_q = query_states[start:end].view(1, single_seqlen, -1) - single_k = key_states[start:end].reshape(1, single_seqlen, -1) - single_v = value_states[start:end].reshape(1, single_seqlen, -1) - single_out = attn_output[start:end, :].view(1, single_seqlen, -1) - mask = context.attention_mask[i] - if q_seqlens[i] == kv_seqlens[i]: - ext.prompt_flash_attention( - single_out, - single_q, - single_k, - single_v, - mask, - [kv_seqlens[i]], - kv_seqlens[i], - head, - numKeyValueHeads, - dim, - ) - else: - key_cache = key_cache.reshape(1, kv_cache_len, - numKeyValueHeads * dim) - value_cache = value_cache.reshape(1, kv_cache_len, - numKeyValueHeads * dim) - for j in range(q_seqlens[i]): - single_q = query_states[start + j:start + j + 1].view(1, 1, -1) - single_out = attn_output[start + j:start + j + 1].view( - 1, 1, -1) - ext.paged_attention( - single_out, - single_q, - key_cache, - value_cache, - mask[j:j + 1], - [kv_seqlens[i]], - head, - numKeyValueHeads, - dim, - block_offsets[i:i + 1], - block_size, - ) - - -def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seqlens, - block_table, block_size): - numKeyValueHeads = k_cache.shape[1] - assert k_cache.shape[1] == v_cache.shape[1] - bs, head, dim = q.shape - kv_cache_len = k_cache.shape[0] - q = q.reshape(bs, 1, head * dim) - k_cache = k_cache.reshape(1, kv_cache_len, 
numKeyValueHeads * dim) - v_cache = v_cache.reshape(1, kv_cache_len, numKeyValueHeads * dim) - ext.paged_attention( - attn_output.view(q.shape), - q, - k_cache, - v_cache, - None, - kv_seqlens, - head, - numKeyValueHeads, - dim, - block_table, - block_size, - ) - - -def paged_attention_fwd( - query_states: Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - key_cache: Tensor, - value_cache: Tensor, - attn_output: Tensor, - block_offsets: Tensor, - q_start_loc: Tensor, - q_seqlens: Tensor, - kv_seqlens: Tensor, - max_seqlen: int, - window_size: int = 1, - context=None, -): - is_decoding = query_states.shape[-3] == q_seqlens.size(0) - block_num, block_size, head, dim = key_cache.size() - kv_cache_len = block_num * block_size - k = key_cache.reshape(block_num * block_size, head, dim) - v = value_cache.reshape(block_num * block_size, head, dim) - if not is_decoding: - flash_context_attention( - query_states, - key_states, - value_states, - attn_output, - k, - v, - block_offsets.to(torch.int32), - q_start_loc, - q_seqlens.tolist(), - kv_seqlens.tolist(), - block_size, - kv_cache_len, - context=context, - ) - else: - paged_token_attention( - query_states, - k, - v, - attn_output, - kv_seqlens.tolist(), - block_offsets.to(torch.int32), - block_size, - ) diff --git a/lmdeploy/pytorch/kernels/dipu/rms_norm.py b/lmdeploy/pytorch/kernels/dipu/rms_norm.py deleted file mode 100644 index 8dbcf91ca2..0000000000 --- a/lmdeploy/pytorch/kernels/dipu/rms_norm.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import deeplink_ext.cpp_extensions as ext -import torch -from torch import Tensor - - -def rms_norm(hidden_states: Tensor, weight: Tensor, eps: float = 1e-6): - output = torch.empty_like(hidden_states) - inv_rms_shape = list(hidden_states.shape[:-1]) + [1] - inv_rms = torch.empty(inv_rms_shape, - dtype=torch.float32, - device=hidden_states.device) - ext.rms_norm(output, inv_rms, hidden_states, weight.shape, weight, None, - eps) - return output diff --git a/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py new file mode 100644 index 0000000000..b8a55d4225 --- /dev/null +++ b/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .dispatcher import FunctionDispatcher + +moe_gating_topk_softmax = FunctionDispatcher( + 'moe_gating_topk_softmax').make_caller() diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index c0aa9cf61d..0bb6cf6c40 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -7,7 +7,7 @@ import numpy as np from torch import Tensor -from lmdeploy.messages import EngineGenerationConfig, LogitsProcessor +from lmdeploy.messages import GenerationConfig, LogitsProcessor from lmdeploy.utils import get_logger from .block import LogicalTokenBlocks @@ -46,15 +46,16 @@ class SamplingParam: bad_words: List[int] = field(default_factory=list) max_new_tokens: int = 512 min_new_tokens: int = 0 + response_format: Optional[str] = None logits_processors: Optional[List[LogitsProcessor]] = None @classmethod - def from_gen_config(self, gen_config: EngineGenerationConfig): + def from_gen_config(self, gen_config: GenerationConfig): """from gen config.""" min_new_tokens = gen_config.min_new_tokens or 0 - stop_words = gen_config.stop_words or [] - bad_words = gen_config.bad_words or [] + stop_words = gen_config.stop_token_ids or [] + bad_words = gen_config.bad_token_ids or [] if gen_config.ignore_eos: bad_words += stop_words stop_words = [] @@ -64,6 +65,7 @@ def from_gen_config(self, gen_config: EngineGenerationConfig): temperature = gen_config.temperature repetition_penalty = gen_config.repetition_penalty max_new_tokens = gen_config.max_new_tokens + response_format = gen_config.response_format if top_p < 0 or top_p > 1.0: logger.warning('`top_p` has to be a float > 0 and < 1' @@ -97,6 +99,7 @@ def from_gen_config(self, gen_config: EngineGenerationConfig): random_seed=gen_config.random_seed, stop_words=stop_words, bad_words=bad_words, + response_format=response_format, max_new_tokens=max_new_tokens, min_new_tokens=min_new_tokens, logits_processors=gen_config.logits_processors) diff --git a/lmdeploy/pytorch/models/baichuan.py b/lmdeploy/pytorch/models/baichuan.py index 1a7c319522..a8c01e45ee 100644 --- a/lmdeploy/pytorch/models/baichuan.py +++ b/lmdeploy/pytorch/models/baichuan.py @@ -67,6 +67,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Rewrite of Attention.forward.""" @@ -186,6 +187,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Rewrite of BaichuanAttention.forward.""" diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index d472e01947..d2009d30b6 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -213,15 +213,14 @@ def _contiguous_batching_forward( return output, kv_cache - def forward( - self, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=None, - use_cache=True, - output_attentions=False, - ): + def forward(self, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=None, + use_cache=True, + output_attentions=False, + **kwargs): return self._contiguous_batching_forward( hidden_states, rotary_pos_emb, diff --git a/lmdeploy/pytorch/models/deepseek.py b/lmdeploy/pytorch/models/deepseek.py index 7bf0468064..331c689682 100644 --- a/lmdeploy/pytorch/models/deepseek.py +++ b/lmdeploy/pytorch/models/deepseek.py @@ -140,6 
+140,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """forward.""" diff --git a/lmdeploy/pytorch/models/falcon.py b/lmdeploy/pytorch/models/falcon.py index 8f5ea9a6b1..d68a0c32e7 100644 --- a/lmdeploy/pytorch/models/falcon.py +++ b/lmdeploy/pytorch/models/falcon.py @@ -215,16 +215,15 @@ def __rotary_emb_fn(query_states, key_states, value_states): else: return output_tensor, layer_past - def forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - ): + def forward(self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + **kwargs): return self._contiguous_batching_forward(hidden_states, alibi, layer_past) diff --git a/lmdeploy/pytorch/models/internlm.py b/lmdeploy/pytorch/models/internlm.py index 54def0159e..4e4e140370 100644 --- a/lmdeploy/pytorch/models/internlm.py +++ b/lmdeploy/pytorch/models/internlm.py @@ -123,6 +123,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """forward.""" diff --git a/lmdeploy/pytorch/models/phi3.py b/lmdeploy/pytorch/models/phi3.py index 646b002435..5a4329d690 100644 --- a/lmdeploy/pytorch/models/phi3.py +++ b/lmdeploy/pytorch/models/phi3.py @@ -152,6 +152,7 @@ def forward( past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """rewrite of forward.""" diff --git a/lmdeploy/pytorch/models/starcoder2.py b/lmdeploy/pytorch/models/starcoder2.py index 7a1e9150a1..1a2d9d7488 100644 --- a/lmdeploy/pytorch/models/starcoder2.py +++ b/lmdeploy/pytorch/models/starcoder2.py @@ -170,6 +170,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + **kwargs ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """forward.""" diff --git a/lmdeploy/pytorch/passkey_retrieval.py b/lmdeploy/pytorch/passkey_retrieval.py index 429ee1a423..460fa70317 100644 --- a/lmdeploy/pytorch/passkey_retrieval.py +++ b/lmdeploy/pytorch/passkey_retrieval.py @@ -3,7 +3,7 @@ import os import random -from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig +from lmdeploy.messages import GenerationConfig, PytorchEngineConfig from lmdeploy.model import MODELS from lmdeploy.tokenizer import Tokenizer @@ -33,7 +33,7 @@ def __init__(self, self.generator = self.tm_model.create_instance() self.model = MODELS.get(model_name)() seed = random.getrandbits(64) - self.gen_config = EngineGenerationConfig( + self.gen_config = GenerationConfig( max_new_tokens=32, top_k=40, top_p=0.8, diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 47ff4e083e..93950c85cc 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -5,13 +5,13 
@@ import os import random from contextlib import asynccontextmanager +from copy import deepcopy from itertools import count from queue import Empty, Queue from threading import Thread from typing import Any, Dict, List, Literal, Optional, Tuple, Union -from lmdeploy.messages import (EngineGenerationConfig, GenerationConfig, - PytorchEngineConfig, Response, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, Response, TurbomindEngineConfig) from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model from lmdeploy.serve.utils import LogitsMixin, _get_event_loop @@ -23,18 +23,17 @@ def get_names_from_model(model_path: str, model_name: str = None): """Get model name and chat template name from workspace model.""" - from configparser import ConfigParser triton_model_path = os.path.join(model_path, 'triton_models', 'weights') if not os.path.exists(triton_model_path): chat_template_name = best_match_model(model_path) else: # `model_path` refers to a turbomind model, reading # chat_template_name from the config - ini_path = os.path.join(triton_model_path, 'config.ini') - with open(ini_path, 'r') as f: - parser = ConfigParser() - parser.read_file(f) - chat_template_name = parser['llama']['chat_template'] + config_path = os.path.join(triton_model_path, 'config.yaml') + with open(config_path, 'r') as f: + import yaml + config = yaml.safe_load(f) + chat_template_name = config['model_config']['chat_template'] model_name = model_name if model_name else model_path return model_name, chat_template_name @@ -290,17 +289,15 @@ async def get_generator(self, stop: bool, session_id: int): self.running_session_ids.add(session_id) return generator - def batch_infer( - self, - prompts: Union[List[str], str, List[Dict], List[List[Dict]]], - gen_config: Optional[Union[GenerationConfig, - List[GenerationConfig], - EngineGenerationConfig, - List[EngineGenerationConfig]]] = None, - do_preprocess: bool = True, - adapter_name: Optional[str] = None, - use_tqdm: bool = False, - **kwargs): + def batch_infer(self, + prompts: Union[List[str], str, List[Dict], + List[List[Dict]]], + gen_config: Optional[Union[GenerationConfig, + List[GenerationConfig]]] = None, + do_preprocess: bool = True, + adapter_name: Optional[str] = None, + use_tqdm: bool = False, + **kwargs): """Inference a batch of prompts. 
Args: @@ -321,13 +318,10 @@ def batch_infer( assert isinstance(prompts, List), 'prompts should be a list' if gen_config is None: gen_config = GenerationConfig() - # set random if it is not set - if not isinstance(gen_config, List) and gen_config.random_seed is None: - gen_config.random_seed = random.getrandbits(64) if not isinstance(gen_config, List): gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config),\ - 'input gen_confg length differs from the length of prompts' # noqa + assert len(prompts) == len(gen_config), \ + 'input gen_confg length differs from the length of prompts' # noqa prompt_num = len(prompts) session_ids = [next(self._session_id) for _ in range(prompt_num)] outputs = [ @@ -377,9 +371,7 @@ def stream_infer( self, prompts: Union[List[str], str, List[Dict], List[List[Dict]]], gen_config: Optional[Union[GenerationConfig, - List[GenerationConfig], - EngineGenerationConfig, - List[EngineGenerationConfig]]] = None, + List[GenerationConfig]]] = None, do_preprocess: bool = True, adapter_name: Optional[str] = None, **kwargs): @@ -402,13 +394,10 @@ def stream_infer( assert isinstance(prompts, List), 'prompts should be a list' if gen_config is None: gen_config = GenerationConfig() - # set random if it is not set - if not isinstance(gen_config, List) and gen_config.random_seed is None: - gen_config.random_seed = random.getrandbits(64) if not isinstance(gen_config, List): gen_config = [gen_config] * len(prompts) - assert len(prompts) == len(gen_config),\ - 'input gen_confg length differs from the length of prompts' # noqa + assert len(prompts) == len(gen_config), \ + 'input gen_confg length differs from the length of prompts' # noqa session_ids = [next(self._session_id) for _ in range(len(prompts))] outputs = Queue() generators = [] @@ -478,8 +467,7 @@ async def generate( self, messages, session_id: int, - gen_config: Optional[Union[GenerationConfig, - EngineGenerationConfig]] = None, + gen_config: Optional[GenerationConfig] = None, tools: Optional[List[object]] = None, stream_response: bool = True, sequence_start: bool = True, @@ -508,11 +496,17 @@ async def generate( self.id2step[str(session_id)] = step if gen_config is None: gen_config = GenerationConfig() - if type(gen_config) is GenerationConfig: - gen_config = EngineGenerationConfig.From(gen_config, - self.tokenizer) - if gen_config.stop_words is None: - gen_config.stop_words = self.stop_words + else: + gen_config = deepcopy(gen_config) + gen_config.convert_stop_bad_words_to_ids(self.tokenizer) + if gen_config.stop_token_ids is None: + gen_config.stop_token_ids = self.stop_words + if not gen_config.do_sample: + # greedy decode + gen_config.top_k = 1 + # avoid unnecessary process + gen_config.temperature = 1.0 + gen_config.repetition_penalty = 1.0 # set random if it is not set and sequence_start is True if gen_config.random_seed is None and sequence_start: gen_config.random_seed = random.getrandbits(64) @@ -641,8 +635,7 @@ def parse_tool_response(self, text, tools, **kwargs): def chat(self, prompt: str, session=None, - gen_config: Optional[Union[GenerationConfig, - EngineGenerationConfig]] = None, + gen_config: Optional[GenerationConfig] = None, do_preprocess: bool = True, **kwargs) -> Session: """Chat. 
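Tying the serve-side changes together, a hedged usage sketch of `batch_infer` after the signature cleanup. The model path is an assumption; a single `GenerationConfig` is broadcast to all prompts, and a list must match the prompt count per the assertion above.

from lmdeploy import pipeline
from lmdeploy.messages import GenerationConfig

pipe = pipeline('internlm/internlm2-chat-7b')   # hypothetical model

prompts = ['Hi, please introduce yourself.', 'Shanghai is']
gen_cfgs = [
    GenerationConfig(do_sample=True, top_p=0.8, temperature=0.7,
                     max_new_tokens=256),
    # do_sample=False: generate() now forces greedy decoding
    # (top_k=1, temperature=1.0, repetition_penalty=1.0)
    GenerationConfig(do_sample=False, max_new_tokens=64),
]
responses = pipe.batch_infer(prompts, gen_config=gen_cfgs)
for resp in responses:
    print(resp.text)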
diff --git a/lmdeploy/serve/gradio/vl.py b/lmdeploy/serve/gradio/vl.py index ebeb371492..3413d62405 100644 --- a/lmdeploy/serve/gradio/vl.py +++ b/lmdeploy/serve/gradio/vl.py @@ -8,7 +8,7 @@ from packaging.version import Version, parse from PIL import Image -from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig, +from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig) from lmdeploy.model import ChatTemplateConfig from lmdeploy.pytorch.engine.request import _run_until_complete @@ -128,11 +128,11 @@ def chat(chatbot, session, max_new_tokens, top_p, top_k, temperature): ' Please restart the session by reset button.') yield chatbot, session, enable_btn, disable_btn, enable_btn else: - gen_config = EngineGenerationConfig(max_new_tokens=max_new_tokens, - top_p=top_p, - top_k=top_k, - temperature=temperature, - stop_words=engine.stop_words) + gen_config = GenerationConfig(max_new_tokens=max_new_tokens, + top_p=top_p, + top_k=top_k, + temperature=temperature, + stop_token_ids=engine.stop_words) step = session.step state = DetokenizeState(len(input_ids)) for outputs in generator.stream_infer( diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index c434faf86f..34a134973b 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -298,6 +298,11 @@ async def chat_completions_v1(request: ChatCompletionRequest, 1.0 means no penalty - stop (str | List[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. + - response_format (Dict | None): Only pytorch backend support formatting + response. Examples: `{"type": "json_schema", "json_schema": {"name": + "test","schema": {"properties": {"name": {"type": "string"}}, + "required": ["name"], "type": "object"}}}` + or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}` - logit_bias (Dict): Bias to logits. Only supported in pytorch engine. - tools (List): A list of tools the model may call. Currently, only internlm2 functions are supported as a tool. 
Use this to specify a @@ -345,6 +350,13 @@ async def chat_completions_v1(request: ChatCompletionRequest, gen_logprobs, logits_processors = None, None if request.logprobs and request.top_logprobs: gen_logprobs = request.top_logprobs + response_format = None + if request.response_format and request.response_format.type != 'text': + if VariableInterface.async_engine.backend != 'pytorch': + return create_error_response( + HTTPStatus.BAD_REQUEST, + 'only pytorch backend can use response_format now') + response_format = request.response_format.model_dump() if request.logit_bias is not None: try: @@ -360,6 +372,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, gen_config = GenerationConfig( max_new_tokens=request.max_tokens, + do_sample=True, logprobs=gen_logprobs, top_k=request.top_k, top_p=request.top_p, @@ -368,6 +381,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, ignore_eos=request.ignore_eos, stop_words=request.stop, skip_special_tokens=request.skip_special_tokens, + response_format=response_format, logits_processors=logits_processors, random_seed=random_seed) @@ -590,6 +604,7 @@ async def completions_v1(request: CompletionRequest, gen_config = GenerationConfig( max_new_tokens=request.max_tokens if request.max_tokens else 512, + do_sample=True, logprobs=request.logprobs, top_k=request.top_k, top_p=request.top_p, @@ -675,7 +690,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: # Non-streaming response usage = UsageInfo() - choices = [] + choices = [None] * len(generators) async def _inner_call(i, generator): final_logprobs = [] @@ -704,12 +719,12 @@ async def _inner_call(i, generator): assert final_res is not None choice_data = CompletionResponseChoice( - index=0, + index=i, text=text, finish_reason=final_res.finish_reason, logprobs=logprobs, ) - choices.append(choice_data) + choices[i] = choice_data total_tokens = sum([ final_res.history_token_len, final_res.input_token_len, @@ -841,6 +856,7 @@ async def chat_interactive_v1(request: GenerateRequest, gen_config = GenerationConfig( max_new_tokens=request.request_output_len, + do_sample=True, top_p=request.top_p, top_k=request.top_k, temperature=request.temperature, @@ -963,7 +979,7 @@ def serve(model_path: str, api_keys (List[str] | str | None): Optional list of API keys. Accepts string type as a single api_key. Default to None, which means no api key applied. ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'. 
- """ # noqa E501 + """ # noqa E501 if os.getenv('TM_LOG_LEVEL') is None: os.environ['TM_LOG_LEVEL'] = log_level logger.setLevel(log_level) diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 48c46cae25..bd54028c39 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -87,6 +87,25 @@ class StreamOptions(BaseModel): include_usage: Optional[bool] = False +class JsonSchema(BaseModel): + name: str + # description is not used since it depends on model + description: Optional[str] = None + # use alias since pydantic does not support the OpenAI key `schema` + json_schema: Optional[Dict[str, Any]] = Field(default=None, + alias='schema', + examples=[None]) + # strict is not used + strict: Optional[bool] = False + + +class ResponseFormat(BaseModel): + # regex_schema is extended by lmdeploy to support regex output + type: Literal['text', 'json_object', 'json_schema', 'regex_schema'] + json_schema: Optional[JsonSchema] = None + regex_schema: Optional[str] = None + + class ChatCompletionRequest(BaseModel): """Chat completion request.""" model: str @@ -99,7 +118,7 @@ class ChatCompletionRequest(BaseModel): logprobs: Optional[bool] = False top_logprobs: Optional[int] = None n: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = None + logit_bias: Optional[Dict[str, float]] = Field(default=None, examples=[None]) # noqa max_tokens: Optional[int] = Field(default=None, examples=[None]) stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None]) # noqa # yapf: enable @@ -109,6 +128,8 @@ class ChatCompletionRequest(BaseModel): presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None + response_format: Optional[ResponseFormat] = Field(default=None, + examples=[None]) # noqa # additional argument of lmdeploy repetition_penalty: Optional[float] = 1.0 session_id: Optional[int] = -1 diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index f8e707e5c6..e9d9115de0 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -3,6 +3,7 @@ import numpy as np +from lmdeploy.pytorch.check_env import try_import_deeplink from lmdeploy.serve.async_engine import AsyncEngine from lmdeploy.utils import get_logger from lmdeploy.vl.constants import IMAGE_DUMMY_TOKEN_INDEX, IMAGE_TOKEN @@ -18,6 +19,8 @@ class VLAsyncEngine(AsyncEngine): def __init__(self, model_path: str, **kwargs) -> None: vision_config = kwargs.pop('vision_config', None) backend_config = kwargs.get('backend_config', None) + if kwargs.get('backend', '') == 'pytorch': + try_import_deeplink(backend_config.device_type) self.vl_encoder = ImageEncoder(model_path, vision_config, backend_config=backend_config) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index ba488b77a4..ade7875ce1 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -3,7 +3,7 @@ import random from lmdeploy.archs import get_model_arch -from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig +from lmdeploy.messages import GenerationConfig, TurbomindEngineConfig from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.async_engine import get_names_from_model from lmdeploy.tokenizer import DetokenizeState @@ -70,7 +70,7 @@ def main(model_path: str, request_output_len (int): output token nums chat_template_config (ChatTemplateConfig): chat template config kwargs (dict): unused args - """ # noqa: E 501 + """ # noqa: E 501 # 
chat template _, chat_template_name = get_names_from_model(model_path) @@ -110,12 +110,12 @@ def main(model_path: str, if stop_words is not None: stop_words = stop_words[0][0].tolist() - gen_config = EngineGenerationConfig(max_new_tokens=request_output_len, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - stop_words=stop_words) + gen_config = GenerationConfig(max_new_tokens=request_output_len, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + stop_token_ids=stop_words) nth_round = 1 step = 0 diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py new file mode 100644 index 0000000000..bec6120b7b --- /dev/null +++ b/lmdeploy/turbomind/deploy/config.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import json +from dataclasses import asdict, fields + +# use pydantic.dataclasses.dataclass to check data type +from pydantic.dataclasses import dataclass + +from lmdeploy.messages import TurbomindEngineConfig + + +def config_from_dict(cls, env): + """initiate an instance of a config class from a dict.""" + params = inspect.signature(cls).parameters + used = {k: v for k, v in env.items() if k in params and v is not None} + return cls(**used) + + +def config_to_dict(config): + """export config to a dict.""" + if not config: + return dict() + assert isinstance(config, (ModelConfig, AttentionConfig, LoraConfig)), \ + f'A dataclass is expected, but got {type(config)}' + + return asdict(config) + + +@dataclass +class ModelConfig: + model_name: str = '' + chat_template: str = '' + model_arch: str = None + head_num: int = None + kv_head_num: int = None + hidden_units: int = None + vocab_size: int = None + num_layer: int = None + inter_size: int = None + norm_eps: float = None + attn_bias: int = None + start_id: int = None + end_id: int = None + size_per_head: int = 128 + group_size: int = 0 + weight_type: str = None + session_len: int = None + tp: int = 1 + model_format: str = 'hf' + + +@dataclass +class AttentionConfig: + rotary_embedding: int = 128 + rope_theta: float = 10000.0 + max_position_embeddings: int = 0 + original_max_position_embeddings: int = 0 + rope_scaling_type: str = '' + rope_scaling_factor: float = 0.0 + use_dynamic_ntk: int = 0 + low_freq_factor: float = 1.0 + high_freq_factor: float = 1.0 + use_logn_attn: int = 0 + cache_block_seq_len: int = 64 + + +@dataclass +class LoraConfig: + lora_policy: str = '' + lora_r: int = 0 + lora_scale: float = 0.0 + lora_max_wo_r: int = 0 + lora_rank_pattern: str = '' + lora_scale_pattern: str = '' + + +@dataclass +class TurbomindModelConfig: + """Config for turbomind model.""" + model_config: ModelConfig = None + attention_config: AttentionConfig = None + lora_config: LoraConfig = None + + def update_from_engine_config(self, config: TurbomindEngineConfig): + """Update the attributes of this instance with the attributes from + TurbomindEngineConfig. 
+ + Args: + config (TurbomindEngineConfig): The turbomind engine config + """ + if config is None: + return + for key, value in asdict(config).items(): + if not value: + continue + + if hasattr(self.model_config, key): + setattr(self.model_config, key, value) + if hasattr(self.attention_config, key): + setattr(self.attention_config, key, value) + + @classmethod + def from_dict(cls, config: dict = {}): + """construct TurbomindModelConfig instance from config in a dict.""" + _cfg = { + field.name: config.get(field.name, {}) + for field in fields(TurbomindModelConfig) + } + + return TurbomindModelConfig( + model_config=config_from_dict(ModelConfig, _cfg['model_config']), + attention_config=config_from_dict(AttentionConfig, + _cfg['attention_config']), + lora_config=config_from_dict(LoraConfig, _cfg['lora_config'])) + + def to_dict(self): + """export to a dict.""" + return dict(model_config=config_to_dict(self.model_config), + attention_config=config_to_dict(self.attention_config), + lora_config=config_to_dict(self.lora_config)) + + @property + def session_len(self): + return self.model_config.session_len + + @property + def tensor_para_size(self): + return self.model_config.tp + + @property + def weight_type(self): + return self.model_config.weight_type + + @property + def group_size(self): + return self.model_config.group_size + + def __str__(self): + return json.dumps(self.to_dict(), indent=2) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 441b3cbe22..bce9bbd614 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -13,10 +13,11 @@ from ...utils import _get_and_verify_max_len from ..supported_models import SUPPORTED_ARCHS, is_supported +from .config import TurbomindModelConfig from .exporter import get_exporter_factory from .policy import get_input_policy from .source_model.base import INPUT_MODELS -from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig +from .target_model.base import OUTPUT_MODELS SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', 'gptq', None] logger = get_logger('lmdeploy') @@ -93,14 +94,14 @@ def get_output_model_registered_name_and_config(model_path: str, Args: model_path (str): the path of the input model model_format (str): the format of the model, which can be one of - ['meta_llama', 'hf', 'awq'] + ['meta_llama', 'hf', 'awq', 'gptq'] group_size (int): the size of group used by awq model """ register_name = 'tm' turbomind_model_arch = 'llama' weight_type = 'fp16' - config = TurbomindModelConfig.from_dict({}, allow_none=True) + config = TurbomindModelConfig.from_dict() if model_format == 'meta_llama': session_len = 2048 @@ -124,10 +125,11 @@ def get_output_model_registered_name_and_config(model_path: str, 'Device does not support bfloat16. 
Set float16 forcefully') weight_type = 'fp16' - config.model_arch = model_arch - config.session_len = session_len + 8 - config.weight_type = weight_type - config.group_size = group_size + config.model_config.model_arch = model_arch + config.model_config.weight_type = weight_type + config.model_config.model_format = model_format + config.model_config.group_size = group_size + config.model_config.session_len = session_len lora_type = 'plora' if turbomind_model_arch == 'xcomposer2' else '' @@ -181,7 +183,7 @@ def find_quantization_config(nested, target_key): def get_tm_model(model_path, model_name, chat_template_name, - engine_config, + engine_config: TurbomindEngineConfig, group_size: int = None, out_dir: str = None): """Create turbomind model. @@ -215,9 +217,6 @@ def get_tm_model(model_path, f'mismatched quant group size: user input "{group_size}" ' \ f'vs model quant_config "{_group_size}"' - engine_config.model_format = quant_method - group_size = _group_size - if quant_method == 'awq': assert version == 'gemm', \ f'unsupported quant config: {quant_config}' @@ -228,6 +227,9 @@ def get_tm_model(model_path, else: assert 0, f'unsupported quant_config: {quant_config}' + engine_config.model_format = quant_method + group_size = _group_size + # Compatible to awq models that are quantized by lmdeploy (<=v0.3.0) if not group_size: group_size = 128 @@ -245,38 +247,28 @@ def get_tm_model(model_path, tokenizer_path=model_path, input_policy=input_policy) - output_model_name, cfg, exporter_factory = \ + output_model_name, tm_cfg, exporter_factory = \ get_output_model_registered_name_and_config( model_path=model_path, model_format=engine_config.model_format, group_size=group_size) - cfg.chat_template = chat_template_name - cfg.model_name = model_name - cfg.tensor_para_size = engine_config.tp + tm_cfg.model_config.chat_template = chat_template_name + tm_cfg.model_config.model_name = model_name + tm_cfg.model_config.tp = engine_config.tp output_model = OUTPUT_MODELS.get(output_model_name)( input_model=input_model, - cfg=cfg, + cfg=tm_cfg, exporter_factory=exporter_factory, out_dir=out_dir) - if engine_config.rope_scaling_factor == 0: - # to avoid `rope_scaling_factor` from engine_config override - # the rope_scaling_factor in TurbomindModelConfig - engine_config.rope_scaling_factor = None - output_model.cfg.update_from_engine_config(engine_config) - # cast bool to int, otherwise, the bool variables will be saved to - # config.ini as string - # TODO(lvhan): change config.ini to config.yaml - output_model.cfg.enable_prefix_caching = int( - output_model.cfg.enable_prefix_caching) - output_model.cfg.use_logn_attn = int(output_model.cfg.use_logn_attn) + return output_model def main(model_name: str, model_path: str, - model_format: str = None, + model_format: str = 'hf', chat_template: str = None, tokenizer_path: str = None, dst_path: str = 'workspace', @@ -291,10 +283,10 @@ def main(model_name: str, model_name (str): unused any longer model_path (str): the directory path of the model model_format (str): the format of the model, should choose from - ['meta_llama', 'hf', 'awq', None]. 'meta_llama' stands for META's - llama format, 'hf' means huggingface llama format, and 'awq' means - llama(hf) model quantized by lmdeploy/lite/quantization/awq.py. - The default value is None + ['meta_llama', 'hf', 'awq', 'gptq']. 'meta_llama' stands for META's + llama format, 'hf' means huggingface model, and 'awq', `gptq` + means models quantized by `autoawq` and `autogptq` respectively. 
+ The default value is hf chat_template (str): the name of the built-in chat template. tokenizer_path (str): the path of tokenizer model dst_path (str): the destination path that saves outputs diff --git a/lmdeploy/turbomind/deploy/exporter.py b/lmdeploy/turbomind/deploy/exporter.py index 48f9312fa6..9667d34583 100644 --- a/lmdeploy/turbomind/deploy/exporter.py +++ b/lmdeploy/turbomind/deploy/exporter.py @@ -74,9 +74,9 @@ class BaseExporter(ABC): def __init__(self, model: BaseOutputModel): self.model = model - self.tp = model.cfg.tensor_para_size - self.head_dim = model.cfg.size_per_head - self.inter_size = model.cfg.inter_size + self.tp = model.tensor_para_size + self.head_dim = model.model_config.size_per_head + self.inter_size = model.model_config.inter_size def export_attn(self, idx: int, qkvo, kind: str, pack_fn=identity): if all(x is None for x in qkvo): @@ -156,7 +156,7 @@ class QuantWeightExporter(BaseExporter): def __init__(self, model: BaseOutputModel, pack_fn): super().__init__(model) self.pack_fn = pack_fn - self.group_size = model.cfg.group_size + self.group_size = model.tm_config.group_size def export(self, r: BaseReader, i: int): diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py index 87983b2551..6b839876fe 100644 --- a/lmdeploy/turbomind/deploy/target_model/base.py +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -1,20 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. -import configparser -import copy -import inspect -import io -import json import os.path as osp from abc import ABC, abstractmethod -from configparser import ConfigParser import torch import tqdm +import yaml from mmengine import Registry -from pydantic.dataclasses import dataclass - -from lmdeploy.messages import TurbomindEngineConfig +from ..config import (AttentionConfig, LoraConfig, ModelConfig, + TurbomindModelConfig, config_from_dict, config_to_dict) from ..source_model.base import BaseInputModel, BaseReader OUTPUT_MODELS = Registry( @@ -31,122 +25,8 @@ def tprint(*args, **kwargs): tqdm.tqdm.write(s.getvalue()) -@dataclass -class TurbomindModelConfig: - """Config for turbomind model.""" - - model_name: str = '' - chat_template: str = '' - model_arch: str = None - tensor_para_size: int = None - head_num: int = None - kv_head_num: int = None - hidden_units: int = None - vocab_size: int = None - num_layer: int = None - inter_size: int = None - norm_eps: float = None - attn_bias: int = None - start_id: int = None - end_id: int = None - session_len: int = None - weight_type: str = None - rotary_embedding: int = 128 - rope_theta: float = 10000.0 - size_per_head: int = 128 - group_size: int = 0 - max_batch_size: int = 64 - max_prefill_token_num: int = 8192 - max_context_token_num: int = 1 - step_length: int = 1 - cache_max_entry_count: float = 0.8 - cache_block_seq_len: int = 64 - cache_chunk_size: int = -1 - enable_prefix_caching: bool = False - num_tokens_per_iter: int = 0 - max_prefill_iters: int = 1 - use_context_fmha: int = 1 - quant_policy: int = 0 - max_position_embeddings: int = 0 - original_max_position_embeddings: int = 0 - rope_scaling_type: str = '' - rope_scaling_factor: float = 0.0 - use_dynamic_ntk: int = 0 - low_freq_factor: float = 1.0 - high_freq_factor: float = 1.0 - use_logn_attn: int = 0 - lora_policy: str = '' - lora_r: int = 0 - lora_scale: float = 0.0 - lora_max_wo_r: int = 0 - lora_rank_pattern: str = '' - lora_scale_pattern: str = '' - - @classmethod - def from_dict(cls, env, allow_none=False): - """Construct from 
dict.""" - params = inspect.signature(cls).parameters - used = {k: v for k, v in env.items() if k in params and v is not None} - if not allow_none: - return cls(**used) - else: - default = { - k: None - for k in params.keys() if params[k].default is inspect._empty - } - default.update(used) - return cls(**default) - - def update_from_engine_config(self, config: TurbomindEngineConfig): - """Update the attributes of this instance with the attributes from - TurbomindEngineConfig. - - Args: - config (TurbomindEngineConfig): The turbomind engine config - """ - if config is None: - return - # Iterate over the fields of 'self' - for field_name, _ in self.__dataclass_fields__.items(): - # If the field value in 'other' is not None, - # update the corresponding field in 'self' - if hasattr(config, field_name) and getattr(config, - field_name) is not None: - setattr(self, field_name, getattr(config, field_name)) - - self.tensor_para_size = config.tp - assert self.session_len is not None - if config.max_prefill_token_num is not None and \ - config.num_tokens_per_iter == 0: - self.num_tokens_per_iter = config.max_prefill_token_num - self.max_prefill_iters = (self.session_len + - config.max_prefill_token_num - - 1) // config.max_prefill_token_num - - def toini(self): - config = copy.deepcopy(self.__dict__) - parser = ConfigParser() - parser['llama'] = config - with io.StringIO() as ss: - parser.write(ss) - ss.seek(0) - ini = ss.read() - return ini - - def __str__(self): - return json.dumps(self.__dict__, indent=2) - - @property - def valid(self): - """Check if cfg is valid.""" - for _, v in self.__dict__.items(): - if v is None: - return False - return True - - def _weight_dtype_map(weight_type: str, default=None): - """get weight dtype map.""" + """map literal data type to torch dtype.""" _WEIGHT_DTYPE_MAP = dict( int4=torch.float16, @@ -169,47 +49,65 @@ def __init__(self, out_dir: str = ''): super().__init__() self.input_model = input_model - self.cfg = cfg - if not cfg.valid: - self.cfg = self.get_config(cfg) - assert self.cfg.valid - assert self.cfg.kv_head_num % self.cfg.tensor_para_size == 0 + self.model_config = cfg.model_config + self.attention_config = cfg.attention_config + self.lora_config = cfg.lora_config + self.tensor_para_size = self.model_config.tp self.out_dir = out_dir self.to_file = True if out_dir else False self.tm_params = {} - model_info = self.input_model.model_info() - self.permute_qk = model_info.get('permute_qk', True) + + # get `model_info` and `tokenizer_info` at first, which + # will be updated to `self.model_config` and `self.attention_config` + self.input_model_info = self.input_model.model_info() + self.input_model_tokenizer_info = self.input_model.tokenizer_info() + self.permute_qk = self.input_model_info.get('permute_qk', True) + + self.update_model_config() + assert self.model_config.kv_head_num % self.tensor_para_size == 0 + + self.update_attention_config() + self.update_lora_config() # ! 
Dependency on `self` self.exporters = exporter_factory(self) @abstractmethod - def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig: - """Generate turbomind model config (config.ini).""" - _, bos_id, eos_id = self.input_model.tokenizer_info() + def update_model_config(self): + """Update `self.model_config` according to the input_model's + `tokenizer_info` and `model_info`""" + _, bos_id, eos_id = self.input_model_tokenizer_info - final_cfg = cfg.__dict__ + final_cfg = config_to_dict(self.model_config) final_cfg.update(dict(start_id=bos_id, end_id=eos_id)) - final_cfg.update(self.input_model.model_info()) + final_cfg.update(self.input_model_info) - # vocab_size + # get vocab_size for bin in self.input_model.bins(): emb = bin.tok_embeddings() if emb is not None: - _vocab_size, dim = emb.shape + _vocab_size, _ = emb.shape break final_cfg.update(dict(vocab_size=_vocab_size)) - return TurbomindModelConfig.from_dict(final_cfg, allow_none=True) + self.model_config = config_from_dict(ModelConfig, final_cfg) + + def update_attention_config(self): + """update attention config according to input model's model info.""" + final_cfg = config_to_dict(self.attention_config) + final_cfg.update(self.input_model_info) + self.attention_config = config_from_dict(AttentionConfig, final_cfg) + + def update_lora_config(self): + """update lora config according to input model's model info.""" + final_cfg = config_to_dict(self.lora_config) + final_cfg.update(self.input_model_info) + self.lora_config = config_from_dict(LoraConfig, final_cfg) def export_config(self) -> None: """export turbomind config.""" if self.to_file: - config = configparser.ConfigParser() - cfg = dict(llama=self.cfg.__dict__) - for section, key_values in cfg.items(): - config[section] = key_values - config_path = osp.join(self.out_dir, 'config.ini') + config_path = osp.join(self.out_dir, 'config.yaml') with open(config_path, 'w') as f: - config.write(f) + yaml.safe_dump(self.tm_config.to_dict(), f) def export_weight(self, param: torch.Tensor, name: str) -> None: """export turbomind weight.""" @@ -222,14 +120,14 @@ def _tofile(tensor, path): if self.to_file: if torch.is_floating_point(param): - torch_type = _weight_dtype_map(self.cfg.weight_type, + torch_type = _weight_dtype_map(self.model_config.weight_type, torch.float16) param = param.to(torch_type) tprint(name, param.shape) _tofile(param, osp.join(self.out_dir, name)) elif len(self.tm_params) > 0: tm_params = self.tm_params - weight_type = self.cfg.weight_type + weight_type = self.model_config.weight_type assert weight_type in ['fp16', 'fp32', 'bf16', 'int4'] # currently, the tensor type should in @@ -269,7 +167,7 @@ def save_split(self, split_dim = None copy = True - tp = self.cfg.tensor_para_size + tp = self.tensor_para_size if split_dim is not None: tprint( f'*** splitting {name}, shape={tensor.shape}, ' @@ -295,7 +193,7 @@ def save_split(self, def export(self) -> None: """Export to turbomind model format.""" - num_layer = self.cfg.num_layer + num_layer = self.model_config.num_layer from tqdm import tqdm pbar = tqdm(total=num_layer, desc='Convert to turbomind format', @@ -321,8 +219,8 @@ def export_misc(self, bin: BaseReader) -> None: def pad_weight(tensor): pad_size = None - vocab_size = self.cfg.vocab_size - tp = self.cfg.tensor_para_size + vocab_size = self.model_config.vocab_size + tp = self.tensor_para_size if vocab_size % tp != 0: pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size @@ -344,3 +242,9 @@ def export_transformer_block(self, bin: BaseReader, i: int) 
-> None: """Export transformer block.""" for e in self.exporters: e.export(bin, i) + + @property + def tm_config(self): + return TurbomindModelConfig(model_config=self.model_config, + attention_config=self.attention_config, + lora_config=self.lora_config) diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py index 57c958fd36..14e1115b20 100644 --- a/lmdeploy/turbomind/deploy/target_model/fp.py +++ b/lmdeploy/turbomind/deploy/target_model/fp.py @@ -1,16 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .base import OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig +from ..config import ModelConfig, config_from_dict, config_to_dict +from .base import OUTPUT_MODELS, BaseOutputModel @OUTPUT_MODELS.register_module(name='tm') class TurbomindModel(BaseOutputModel): """Export to turbomind fp16 format.""" - def get_config(self, cfg: TurbomindModelConfig): - """Get turbomind config.""" - final_cfg = super().get_config(cfg).__dict__ - # attn_bias, inter_size + def update_model_config(self): + """Update `self.model_config`. + + First, call `update_model_config` of the superclass. Then update + `inter_size` and `attn_bias`, which are indicated by the input_model's + weight files. + """ + super().update_model_config() + final_cfg = config_to_dict(self.model_config) + # get attn_bias, inter_size visit = False attn_bias = 0 for bin in self.input_model.bins(): @@ -24,13 +31,13 @@ def get_config(self, cfg: TurbomindModelConfig): break if visit: break - inter_size = self._pad_inter_size(inter_size, final_cfg) + inter_size = self._pad_inter_size(inter_size) final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) - return TurbomindModelConfig.from_dict(final_cfg) + self.model_config = config_from_dict(ModelConfig, final_cfg) - def _pad_inter_size(self, inter_size: int, cfg: dict): - group_size = max(1, cfg['group_size']) - tp = cfg['tensor_para_size'] + def _pad_inter_size(self, inter_size: int): + group_size = max(1, self.model_config.group_size) + tp = self.tensor_para_size groups_per_rank = (inter_size // group_size + tp - 1) // tp inter_size_padded = groups_per_rank * group_size * tp return inter_size_padded diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 75e70e669a..91b057d723 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -15,22 +15,19 @@ def get_llama_gemm(): return bin_path -def read_config(ini_path: str): +def read_config(config_file: str): """Read turbomind config from the turbomind model. Args: - ini_path (str): the path of `config.ini` file in turbomind model + config_file (str): the path of the config file in the turbomind model """ - from configparser import ConfigParser + + import yaml from lmdeploy.turbomind.deploy.target_model.base import \ TurbomindModelConfig - - with open(ini_path, 'r') as f: - parser = ConfigParser() - parser.read_file(f) - section_name = 'llama' - _cfg = parser._sections[section_name] + with open(config_file, 'r') as f: + _cfg = yaml.safe_load(f) cfg = TurbomindModelConfig.from_dict(_cfg) return cfg.head_num, cfg.size_per_head, cfg.inter_size, \ cfg.vocab_size, cfg.tensor_para_size @@ -52,7 +49,7 @@ def main(head_num: int = 32, head_num, size_per_head, inter_size, vocab_size, \ tensor_para_size = read_config( osp.join(model_path, - 'triton_models', 'weights', 'config.ini')) + 'triton_models', 'weights', 'config.yaml')) else: from transformers import AutoConfig config = AutoConfig.from_pretrained(model_path,
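With `read_config` above now parsing YAML, the converted weights directory carries a nested `config.yaml` instead of the flat `config.ini`. The sketch below illustrates the kind of structure this patch implies (a `model_config` / `attention_config` / `lora_config` split, with an `engine_config` block merged in later at runtime); the key names are taken from this diff, but the concrete values are placeholders, not an authoritative schema.

```python
# Illustrative only: key layout inferred from this diff; values are placeholders.
import yaml

config = {
    'model_config': {
        'model_name': 'internlm2',
        'head_num': 32,
        'kv_head_num': 8,
        'size_per_head': 128,
        'inter_size': 14336,
        'vocab_size': 92544,
        'num_layer': 32,
        'weight_type': 'fp16',
        'group_size': 0,
        'session_len': 32768,
    },
    'attention_config': {
        'rotary_embedding': 128,
        'rope_theta': 10000.0,
        'cache_block_seq_len': 64,
    },
    'lora_config': {
        'lora_policy': '',
        'lora_r': 0,
    },
}

text = yaml.safe_dump(config)    # roughly what export_config() writes to config.yaml
restored = yaml.safe_load(text)  # roughly what read_config() loads back
assert restored['model_config']['head_num'] == 32
```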
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index e1ab172bf1..3b05e5717c 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -1,25 +1,26 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio +import json import os.path as osp import sys from concurrent.futures import ThreadPoolExecutor -from configparser import ConfigParser +from dataclasses import asdict from itertools import repeat from queue import LifoQueue, Queue from typing import Dict, Iterable, List, Union import numpy as np import torch +import yaml from torch.nn.utils.rnn import pad_sequence import lmdeploy -from lmdeploy.messages import (EngineGenerationConfig, EngineOutput, - ResponseType, TurbomindEngineConfig) +from lmdeploy.messages import (EngineOutput, GenerationConfig, ResponseType, + TurbomindEngineConfig) from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger, get_model -from .deploy.converter import SUPPORTED_FORMATS, get_tm_model -from .deploy.target_model.base import TurbomindModelConfig +from .deploy.config import TurbomindModelConfig from .supported_models import is_supported from .utils import ModelSource, get_model_source @@ -164,33 +165,53 @@ def _get_params(device_id, que): tm_params[k] = [] tm_params[k].append(v) + def _postprocess_config(self, tm_config, engine_config): + """Postprocess the turbomind config with the engine config.""" + import copy + self.config = copy.deepcopy(tm_config) + # Update the attribute values in `self.config` with the valid values + # from the corresponding attributes in `engine_config`, such as + # `session_len`, `quant_policy`, `rope_scaling_factor`, etc.
+ self.config.update_from_engine_config(engine_config) + + # update some attributes of `engine_config` which depends on + # `session_len` + self.engine_config = engine_config + if engine_config.max_prefill_token_num is not None \ + and engine_config.num_tokens_per_iter == 0: + self.engine_config.num_tokens_per_iter = \ + engine_config.max_prefill_token_num + self.engine_config.max_prefill_iters = ( + self.config.session_len + engine_config.max_prefill_token_num - + 1) // engine_config.max_prefill_token_num + + # pack `self.config` and `self.engine_config` into a dict + self.config_dict = self.config.to_dict() + self.config_dict.update(dict(engine_config=asdict(self.engine_config))) + logger.info(f'turbomind model config:\n\n' + f'{json.dumps(self.config_dict, indent=2)}') + def _from_hf(self, model_source: ModelSource, model_path: str, engine_config: TurbomindEngineConfig): """Load model which is in hf format.""" assert model_source == ModelSource.HF_MODEL, \ f'{model_source} is not supported' - if engine_config is None: - logger.warning('input engine config is None, using the default') - engine_config = TurbomindEngineConfig() - assert engine_config.model_format in SUPPORTED_FORMATS, \ - f'The model format should be in {SUPPORTED_FORMATS}' - assert is_supported(model_path), ( f'turbomind does not support {model_path}. ' 'Plz try pytorch engine instead.') - # convert transformers model into turbomind model format + # convert transformers model into turbomind model + from .deploy.converter import get_tm_model tm_model = get_tm_model(model_path, self.model_name, self.chat_template_name, engine_config) - self.config = tm_model.cfg - logger.info(f'model_config:\n\n{self.config.toini()}') + self._postprocess_config(tm_model.tm_config, engine_config) model_comm = _tm.AbstractTransformerModel.create_llama_model( model_dir='', - config=self.config.toini(), + config=yaml.safe_dump(self.config_dict), tensor_para_size=self.gpu_count, - data_type=self.config.weight_type) + data_type=self.config.model_config.weight_type) # create empty weight self._create_weight(model_comm) @@ -212,42 +233,27 @@ def _from_hf(self, model_source: ModelSource, model_path: str, def _from_workspace(self, model_path: str, engine_config: TurbomindEngineConfig): """Load model which is converted by `lmdeploy convert`""" - ini_path = osp.join(model_path, 'triton_models', 'weights', - 'config.ini') - # load cfg - with open(ini_path, 'r') as f: - parser = ConfigParser() - parser.read_file(f) - section_name = 'llama' - _cfg = parser._sections[section_name] + config_path = osp.join(model_path, 'triton_models', 'weights', + 'config.yaml') + # load TurbomindModelConfig from config file + with open(config_path, 'r') as f: + _cfg = yaml.safe_load(f) cfg = TurbomindModelConfig.from_dict(_cfg) # check whether input tp is valid + self.gpu_count = engine_config.tp if cfg.tensor_para_size != 1 and \ self.gpu_count != cfg.tensor_para_size: - logger.info(f'found tp={cfg.tensor_para_size} in config.ini.') + logger.info(f'found tp={cfg.tensor_para_size} in config.yaml.') self.gpu_count = cfg.tensor_para_size + engine_config.tp = self.gpu_count + + self._postprocess_config(cfg, engine_config) - if engine_config is not None: - engine_config.tp = cfg.tensor_para_size - if engine_config.rope_scaling_factor == 0: - # to avoid `rope_scaling_factor` from engine_config override - # the rope_scaling_factor in TurbomindModelConfig - engine_config.rope_scaling_factor = None - cfg.update_from_engine_config(engine_config) - if self.model_name: - 
cfg.model_name = self.model_name - if self.chat_template_name: - cfg.chat_template_name = self.chat_template_name - # update cfg - self.config = cfg - - # create model - logger.warning(f'model_config:\n\n{cfg.toini()}') weight_dir = osp.join(model_path, 'triton_models', 'weights') model_comm = _tm.AbstractTransformerModel.create_llama_model( model_dir=weight_dir, - config=cfg.toini(), + config=yaml.safe_dump(self.config_dict), tensor_para_size=self.gpu_count, data_type=self.config.weight_type) @@ -404,7 +410,7 @@ def end(self, session_id: int): input_ids, sequence_start=False, sequence_end=True, - gen_config=EngineGenerationConfig(max_new_tokens=0)): + gen_config=GenerationConfig(max_new_tokens=0)): pass async def async_end(self, session_id: int): @@ -421,7 +427,7 @@ def cancel(self, session_id: int): sequence_start=False, sequence_end=False, stop=True, - gen_config=EngineGenerationConfig(max_new_tokens=0)): + gen_config=GenerationConfig(max_new_tokens=0)): pass async def async_cancel(self, session_id: int): @@ -480,7 +486,7 @@ def prepare_embeddings(self, def prepare_inputs(self, session_id, input_ids, - gen_config: EngineGenerationConfig, + gen_config: GenerationConfig, input_embeddings=None, input_embedding_ranges=None, sequence_start: bool = True, @@ -551,13 +557,13 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): inputs['logprobs'] = _broadcast_np(gen_config.logprobs, np.int32) bad_words = [] - if gen_config.bad_words is not None: - bad_words.extend(gen_config.bad_words) + if gen_config.bad_token_ids is not None: + bad_words.extend(gen_config.bad_token_ids) if gen_config.ignore_eos: stop_words = None bad_words.append(self.eos_id) else: - stop_words = gen_config.stop_words + stop_words = gen_config.stop_token_ids stop_words = _construct_stop_or_bad_words(stop_words) bad_words = _construct_stop_or_bad_words(bad_words) @@ -580,7 +586,7 @@ async def async_stream_infer(self, sequence_end: bool = False, step=0, stop=False, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, stream_output=False, **kwargs): """Perform model inference. @@ -595,7 +601,7 @@ async def async_stream_infer(self, sequence_end (bool): indicator for ending a sequence step (int): the offset of the k/v cache stop (bool): indicator for cancelling the session - gen_config (EngineGenerationConfig): generation config + gen_config (GenerationConfig): generation config stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ @@ -663,8 +669,8 @@ async def async_stream_infer(self, outputs = EngineOutput(status, output[:-1].tolist(), len_ - 1) elif len(output) > 0 and \ - gen_config.stop_words is not None and \ - output[-1].item() in gen_config.stop_words: + gen_config.stop_token_ids is not None and \ + output[-1].item() in gen_config.stop_token_ids: outputs = EngineOutput(status, output[:-1].tolist(), len_) else: outputs = EngineOutput(status, output.tolist(), len_) @@ -697,7 +703,7 @@ def stream_infer(self, sequence_end: bool = False, step=0, stop=False, - gen_config: EngineGenerationConfig = None, + gen_config: GenerationConfig = None, stream_output=False, **kwargs): """Perform model inference. 
@@ -712,7 +718,7 @@ def stream_infer(self, sequence_end (bool): indicator for ending a sequence step (int): the offset of the k/v cache stop (bool): indicator for cancelling the session - gen_config (EngineGenerationConfig): generation config + gen_config (GenerationConfig): generation config stream_output (bool): indicator for stream output kwargs (dict): kwargs for backward compatibility """ @@ -776,8 +782,8 @@ def stream_infer(self, outputs = EngineOutput(status, output[:-1].tolist(), len_ - 1, out_logprobs) elif len(output) > 0 and \ - gen_config.stop_words is not None and \ - output[-1].item() in gen_config.stop_words: + gen_config.stop_token_ids is not None and \ + output[-1].item() in gen_config.stop_token_ids: outputs = EngineOutput(status, output[:-1].tolist(), len_, out_logprobs) else: diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py index de1bf04efb..206dd6d08c 100644 --- a/lmdeploy/utils.py +++ b/lmdeploy/utils.py @@ -200,6 +200,7 @@ def get_model(pretrained_model_name_or_path: str, download_kwargs['token'] = token model_path = snapshot_download(pretrained_model_name_or_path, + ignore_patterns=['*.pth'], **download_kwargs) return model_path diff --git a/requirements/runtime.txt b/requirements/runtime.txt index c6a1e74444..e7a55891e3 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -5,6 +5,7 @@ fire mmengine-lite numpy<2.0.0 openai +outlines peft<=0.11.1 pillow protobuf diff --git a/requirements/test.txt b/requirements/test.txt index d06440a9d7..607907dffd 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -3,6 +3,7 @@ coverage pynvml pytest pytest-assume +pytest-cov pytest-order pytest-rerunfailures pytest-sugar diff --git a/src/turbomind/triton_backend/llama/CMakeLists.txt b/src/turbomind/triton_backend/llama/CMakeLists.txt index ac8c47d774..26c580714a 100644 --- a/src/turbomind/triton_backend/llama/CMakeLists.txt +++ b/src/turbomind/triton_backend/llama/CMakeLists.txt @@ -25,5 +25,5 @@ set(llama_triton_backend_files find_package(CUDAToolkit REQUIRED) add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files}) set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) -target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt) +target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt yaml-cpp::yaml-cpp) target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 58fec72e88..e2a564aa44 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -19,7 +19,6 @@ // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "3rdparty/INIReader.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaInstanceComm.h" #include "src/turbomind/models/llama/LlamaLinear.h" @@ -30,34 +29,39 @@ #include "src/turbomind/utils/cuda_utils.h" #include #include +#include namespace ft = turbomind; -std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string inifile) +std::shared_ptr AbstractTransformerModel::createLlamaModel(std::string config_file) { - INIReader reader = INIReader(inifile); - if (reader.ParseError() < 
0) { - std::cout << "[ERROR] Can't load '" << inifile << "'\n"; - return nullptr; + YAML::Node reader; + try { + reader = YAML::Load(config_file); + } + catch (const YAML::Exception& e) { + std::cerr << "Error reading YAML config: " << e.what() << std::endl; + ft::FT_CHECK(false); } - const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type"); - int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); - std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir"); + const auto ft_instance_hyperparameter = reader["ft_instance_hyperparameter"]; + const std::string data_type = ft_instance_hyperparameter["data_type"].as(); + int tensor_para_size = ft_instance_hyperparameter["tensor_para_size"].as(); + std::string model_dir = ft_instance_hyperparameter["model_dir"].as(); if (data_type == "half" || data_type == "fp16") { return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + ft_instance_hyperparameter["tensor_para_size"].as(), + ft_instance_hyperparameter["pipeline_para_size"].as(), + ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), model_dir); } else if (data_type == "bf16") { #ifdef ENABLE_BF16 return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + ft_instance_hyperparameter["tensor_para_size"].as(), + ft_instance_hyperparameter["pipeline_para_size"].as(), + ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF16"); @@ -67,9 +71,9 @@ std::shared_ptr AbstractTransformerModel::createLlamaM else { #ifdef ENABLE_FP32 return std::make_shared>( - reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"), - reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0), + ft_instance_hyperparameter["tensor_para_size"].as(), + ft_instance_hyperparameter["pipeline_para_size"].as(), + ft_instance_hyperparameter["enable_custom_all_reduce"].as(0), model_dir); #else TM_LOG_ERROR("[ERROR] Turbomind is not built with ENABLE_BF32"); @@ -189,81 +193,81 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, weights_(ft::getDeviceCount()), enable_custom_all_reduce_(enable_custom_all_reduce) { - INIReader reader; FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); - if (!model_dir.empty()) { - model_dir_ = model_dir; - const std::string inifile{model_dir + "/config.ini"}; - reader = INIReader(inifile); - if (reader.ParseError() < 0) { - TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str()); - ft::FT_CHECK(false); + YAML::Node reader; + + try { + if (!model_dir.empty()) { + model_dir_ = model_dir; + const std::string config_file{model_dir + "/config.yaml"}; + reader = YAML::LoadFile(config_file); } - } - if (!config.empty()) { - std::FILE* tmpf = std::tmpfile(); - std::fputs(config.c_str(), tmpf); - std::rewind(tmpf); - reader = INIReader(tmpf); - if (reader.ParseError() < 0) { - TM_LOG_ERROR("[ERROR] Can't init with config %s", config.c_str()); - ft::FT_CHECK(false); + if (!config.empty()) { + reader = 
YAML::Load(config); } } + catch (const YAML::Exception& e) { + std::cerr << "Error reading YAML config: " << e.what() << std::endl; + ft::FT_CHECK(false); + } - model_name_ = reader.Get("llama", "model_name"); - model_param_.head_num = reader.GetInteger("llama", "head_num"); - model_param_.head_dim = reader.GetInteger("llama", "size_per_head"); - model_param_.kv_head_num = reader.GetInteger("llama", "kv_head_num", 0); - model_param_.hidden_units = reader.GetInteger("llama", "hidden_units"); - model_param_.layer_num = reader.GetInteger("llama", "num_layer"); - model_param_.inter_size = reader.GetInteger("llama", "inter_size"); - model_param_.vocab_size = reader.GetInteger("llama", "vocab_size"); - model_param_.norm_eps = reader.GetFloat("llama", "norm_eps"); - model_param_.start_id = reader.GetInteger("llama", "start_id"); - model_param_.end_id = reader.GetInteger("llama", "end_id"); - attn_param_.cache_block_seq_len = reader.GetInteger("llama", "cache_block_seq_len", 0); - model_param_.quant_policy = reader.GetInteger("llama", "quant_policy", 0); + const auto model_reader = reader["model_config"]; + const auto attention_reader = reader["attention_config"]; + const auto lora_reader = reader["lora_config"]; + const auto engine_reader = reader["engine_config"]; + + model_name_ = model_reader["model_name"].as(); + model_param_.head_num = model_reader["head_num"].as(); + model_param_.head_dim = model_reader["size_per_head"].as(); + model_param_.kv_head_num = model_reader["kv_head_num"].as(0); + model_param_.hidden_units = model_reader["hidden_units"].as(); + model_param_.layer_num = model_reader["num_layer"].as(); + model_param_.inter_size = model_reader["inter_size"].as(); + model_param_.vocab_size = model_reader["vocab_size"].as(); + model_param_.norm_eps = model_reader["norm_eps"].as(); + model_param_.start_id = model_reader["start_id"].as(); + model_param_.end_id = model_reader["end_id"].as(); + attn_param_.cache_block_seq_len = attention_reader["cache_block_seq_len"].as(0); + model_param_.quant_policy = engine_reader["quant_policy"].as(0); // Only weight classes need these - attn_bias_ = reader.GetInteger("llama", "attn_bias", 0); - group_size_ = reader.GetInteger("llama", "group_size", 0); + attn_bias_ = model_reader["attn_bias"].as(0); + group_size_ = model_reader["group_size"].as(0); // rotary embedding parameters - attn_param_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding"); - attn_param_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f); - attn_param_.rope_scaling_type = reader.Get("llama", "rope_scaling_type", ""); - attn_param_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f); - attn_param_.low_freq_factor = reader.GetFloat("llama", "low_freq_factor", 1.0); - attn_param_.high_freq_factor = reader.GetFloat("llama", "high_freq_factor", 1.0); - attn_param_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0); - attn_param_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0); - attn_param_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0); - - attn_param_.original_max_position_embeddings = reader.GetInteger("llama", "original_max_position_embeddings", 0); - - engine_param_.max_batch_size = reader.GetInteger("llama", "max_batch_size", 0); - engine_param_.max_prefill_token_num = reader.GetInteger("llama", "max_prefill_token_num", 0); - engine_param_.max_context_token_num = reader.GetInteger("llama", "max_context_token_num", 0); - engine_param_.session_len = 
reader.GetInteger("llama", "session_len", 0); - engine_param_.step_length = reader.GetInteger("llama", "step_length", 0); - - engine_param_.cache_max_block_count = reader.GetFloat("llama", "cache_max_entry_count", 0); - engine_param_.cache_chunk_size = reader.GetInteger("llama", "cache_chunk_size", 0); - engine_param_.enable_prefix_caching = reader.GetBoolean("llama", "enable_prefix_caching", false); - - engine_param_.num_tokens_per_iter = reader.GetInteger("llama", "num_tokens_per_iter", 0); - engine_param_.max_prefill_iters = reader.GetInteger("llama", "max_prefill_iters", 1); - - lora_param_.policy = ft::getLoraPolicy(reader.Get("llama", "lora_policy", "")); - lora_param_.r = reader.GetInteger("llama", "lora_r", 0); - lora_param_.scale = reader.GetFloat("llama", "lora_scale", 0); - lora_param_.max_wo_r = reader.GetInteger("llama", "lora_max_wo_r", 0); - lora_param_.rank_pattern = getLoraPattern(reader.Get("llama", "lora_rank_pattern", ""), + attn_param_.rotary_embedding_dim = attention_reader["rotary_embedding"].as(); + attn_param_.rotary_embedding_base = attention_reader["rope_theta"].as(10000.0f); + attn_param_.rope_scaling_type = attention_reader["rope_scaling_type"].as(""); + attn_param_.rope_scaling_factor = attention_reader["rope_scaling_factor"].as(0.f); + attn_param_.low_freq_factor = attention_reader["low_freq_factor"].as(1.0); + attn_param_.high_freq_factor = attention_reader["high_freq_factor"].as(1.0); + attn_param_.max_position_embeddings = attention_reader["max_position_embeddings"].as(0); + attn_param_.use_dynamic_ntk = attention_reader["use_dynamic_ntk"].as(0); + attn_param_.use_logn_attn = attention_reader["use_logn_attn"].as(0); + + attn_param_.original_max_position_embeddings = attention_reader["original_max_position_embeddings"].as(0); + + engine_param_.max_batch_size = engine_reader["max_batch_size"].as(0); + engine_param_.max_prefill_token_num = engine_reader["max_prefill_token_num"].as(0); + engine_param_.max_context_token_num = engine_reader["max_context_token_num"].as(0); + engine_param_.session_len = model_reader["session_len"].as(0); + + engine_param_.cache_max_block_count = engine_reader["cache_max_entry_count"].as(0); + engine_param_.cache_chunk_size = engine_reader["cache_chunk_size"].as(0); + engine_param_.enable_prefix_caching = engine_reader["enable_prefix_caching"].as(false); + + engine_param_.num_tokens_per_iter = engine_reader["num_tokens_per_iter"].as(0); + engine_param_.max_prefill_iters = engine_reader["max_prefill_iters"].as(1); + + lora_param_.policy = ft::getLoraPolicy(reader["lora_config"]["lora_policy"].as("")); + lora_param_.r = lora_reader["lora_r"].as(0); + lora_param_.scale = lora_reader["lora_scale"].as(0); + lora_param_.max_wo_r = lora_reader["lora_max_wo_r"].as(0); + lora_param_.rank_pattern = getLoraPattern(lora_reader["lora_rank_pattern"].as(""), [](const std::string& s) { return std::stoi(s); }); - lora_param_.scale_pattern = getLoraPattern(reader.Get("llama", "lora_scale_pattern", ""), + lora_param_.scale_pattern = getLoraPattern(lora_reader["lora_scale_pattern"].as(""), [](const std::string& s) { return std::stof(s); }); handleMissingParams(); @@ -273,7 +277,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size, const auto device_count = ft::getDeviceCount(); engines_.resize(device_count); - const std::string weight_type_str = reader.Get("llama", "weight_type"); + const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16") { weight_type_ = ft::WeightType::kFP16; } diff --git 
a/src/turbomind/utils/cuda_utils.cc b/src/turbomind/utils/cuda_utils.cc index db783c5637..c13688ff3a 100644 --- a/src/turbomind/utils/cuda_utils.cc +++ b/src/turbomind/utils/cuda_utils.cc @@ -366,33 +366,6 @@ cudaError_t getSetDevice(int i_device, int* o_device) return cudaSuccess; } -FtCudaDataType getModelFileType(std::string ini_file, std::string section_name) -{ - FtCudaDataType model_file_type; - INIReader reader = INIReader(ini_file); - if (reader.ParseError() < 0) { - TM_LOG_WARNING("Can't load %s. Use FP32 as default", ini_file.c_str()); - model_file_type = FtCudaDataType::FP32; - } - else { - std::string weight_data_type_str = std::string(reader.Get(section_name, "weight_data_type")); - if (weight_data_type_str.find("fp32") != std::string::npos) { - model_file_type = FtCudaDataType::FP32; - } - else if (weight_data_type_str.find("fp16") != std::string::npos) { - model_file_type = FtCudaDataType::FP16; - } - else if (weight_data_type_str.find("bf16") != std::string::npos) { - model_file_type = FtCudaDataType::BF16; - } - else { - TM_LOG_WARNING("Invalid type %s. Use FP32 as default", weight_data_type_str.c_str()); - model_file_type = FtCudaDataType::FP32; - } - } - return model_file_type; -} - bool is_16xx_series(const char* name) { const std::regex re(R"(GTX 16\d\d)"); diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index 533263604e..2148fcc164 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -16,11 +16,11 @@ #pragma once -#include "3rdparty/INIReader.h" #include "src/turbomind/macro.h" #include "src/turbomind/utils/cuda_bf16_wrapper.h" #include "src/turbomind/utils/logger.h" +#include #include #include #include @@ -384,8 +384,6 @@ struct getTypeFromCudaDataType { }; #endif -FtCudaDataType getModelFileType(std::string ini_file, std::string section_name); - // clang-format off template struct packed_type; template <> struct packed_type { using type = float; }; // we don't need to pack float by default diff --git a/tests/test_lmdeploy/test_async_engine.py b/tests/test_lmdeploy/test_async_engine.py index 872b6b1abc..0123b2a43c 100644 --- a/tests/test_lmdeploy/test_async_engine.py +++ b/tests/test_lmdeploy/test_async_engine.py @@ -1,4 +1,3 @@ -import configparser import os import tempfile @@ -23,13 +22,12 @@ def test_get_names_from_turbomind_model(): os.makedirs(os.path.join(workspace, 'triton_models', 'weights'), exist_ok=True) - expected_chat_template = 'internlm2' - config = configparser.ConfigParser() - config.add_section('llama') - config.set('llama', 'chat_template', expected_chat_template) + import yaml - with open(f'{workspace}/triton_models/weights/config.ini', 'w') as f: - config.write(f) + expected_chat_template = 'internlm2' + config = dict(model_config=dict(chat_template=expected_chat_template)) + with open(f'{workspace}/triton_models/weights/config.yaml', 'w') as f: + yaml.safe_dump(config, f) _, chat_template = get_names_from_model(workspace) assert chat_template == expected_chat_template diff --git a/tests/test_lmdeploy/test_messages.py b/tests/test_lmdeploy/test_messages.py index f3d44355df..0453602c71 100644 --- a/tests/test_lmdeploy/test_messages.py +++ b/tests/test_lmdeploy/test_messages.py @@ -1,15 +1,13 @@ from typing import List -from lmdeploy import EngineGenerationConfig, GenerationConfig, Tokenizer +from lmdeploy import GenerationConfig, Tokenizer def test_engine_generation_config(): tokenizer = Tokenizer('internlm/internlm-chat-7b') config = GenerationConfig(n=3, stop_words=['']) - 
_config = EngineGenerationConfig.From(config, tokenizer) - - assert _config.n == config.n == 3 and \ - _config.max_new_tokens == config.max_new_tokens and \ - _config.temperature == config.temperature - assert isinstance(_config.stop_words, List) and \ - isinstance(_config.stop_words[0], int) + stop_token_ids = tokenizer.encode('', add_bos=False) + config.convert_stop_bad_words_to_ids(tokenizer) + assert stop_token_ids == config.stop_token_ids + assert isinstance(config.stop_token_ids, List) and \ + isinstance(config.stop_token_ids[0], int) diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py index 95b3e691a6..0d125fe74c 100644 --- a/tests/test_lmdeploy/test_turbomind/test_converter.py +++ b/tests/test_lmdeploy/test_turbomind/test_converter.py @@ -44,10 +44,10 @@ def test_registered_models(): output_name, config, _ = get_output_model_registered_name_and_config( model, model_format=model_format, group_size=0) assert output_name == register_name - assert config.group_size == group_size + assert config.model_config.group_size == group_size assert config.weight_type == weight_type assert config.session_len > 0 - assert config.model_arch is not None + assert config.model_config.model_arch is not None def test_update_from_engine_config(): @@ -61,26 +61,7 @@ def test_update_from_engine_config(): config = copy.deepcopy(_config) config.update_from_engine_config(TurbomindEngineConfig()) assert config.tensor_para_size == 1 - assert config.session_len == 32776 - assert config.max_batch_size == 128 - assert config.cache_max_entry_count == 0.8 - assert config.quant_policy == 0 - assert config.max_prefill_iters == 5 - assert config.num_tokens_per_iter == 8192 - - config = copy.deepcopy(_config) - config.update_from_engine_config( - TurbomindEngineConfig(max_prefill_token_num=2048, - num_tokens_per_iter=0)) - assert config.max_prefill_iters == 17 - assert config.num_tokens_per_iter == 2048 - - config = copy.deepcopy(_config) - config.update_from_engine_config( - TurbomindEngineConfig(max_prefill_token_num=2048, - num_tokens_per_iter=256)) - assert config.max_prefill_iters == 1 - assert config.num_tokens_per_iter == 256 + assert config.session_len == 32768 config = copy.deepcopy(_config) engine_config = TurbomindEngineConfig(model_format='hf', @@ -98,11 +79,7 @@ def test_update_from_engine_config(): assert (config.tensor_para_size == engine_config.tp) assert (config.session_len == engine_config.session_len) - assert (config.max_batch_size == engine_config.max_batch_size) + assert (config.attention_config.rope_scaling_factor == + engine_config.rope_scaling_factor) assert ( - config.cache_max_entry_count == engine_config.cache_max_entry_count) - assert (config.quant_policy == engine_config.quant_policy) - assert (config.rope_scaling_factor == engine_config.rope_scaling_factor) - assert (config.use_logn_attn == engine_config.use_logn_attn) - assert (config.max_prefill_iters == engine_config.max_prefill_iters) - assert (config.num_tokens_per_iter == engine_config.num_tokens_per_iter) + config.attention_config.use_logn_attn == engine_config.use_logn_attn)
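The `test_messages.py` change above reflects the move from token-id `stop_words`/`bad_words` on `EngineGenerationConfig` to `stop_token_ids`/`bad_token_ids` on `GenerationConfig`, with string stop words converted in place. A minimal usage sketch, assuming the tokenizer is reachable; the `'<eoa>'` stop word is a placeholder and not taken from this patch.

```python
# Usage sketch based on the updated test; '<eoa>' is a placeholder stop word.
from typing import List

from lmdeploy import GenerationConfig, Tokenizer

tokenizer = Tokenizer('internlm/internlm-chat-7b')
gen_config = GenerationConfig(n=3, stop_words=['<eoa>'])

# convert string stop/bad words into token ids on the config itself
gen_config.convert_stop_bad_words_to_ids(tokenizer)

assert isinstance(gen_config.stop_token_ids, List)
assert isinstance(gen_config.stop_token_ids[0], int)
```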
diff --git a/tests/test_lmdeploy/test_utils.py b/tests/test_lmdeploy/test_utils.py index ebafdc2634..cdabf98a24 100644 --- a/tests/test_lmdeploy/test_utils.py +++ b/tests/test_lmdeploy/test_utils.py @@ -1,6 +1,8 @@ from transformers import AutoConfig -from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig +from lmdeploy.turbomind.deploy.config import (ModelConfig, + TurbomindModelConfig, + config_from_dict) from lmdeploy.utils import _get_and_verify_max_len @@ -20,7 +22,7 @@ def test_get_and_verify_max_len(): assert (_get_and_verify_max_len(config, 102400) == 102400) # with TurbomindModelConfig - config = TurbomindModelConfig.from_dict({}, allow_none=True) - config.session_len = 4096 + config = config_from_dict(TurbomindModelConfig, {}) + config.model_config = config_from_dict(ModelConfig, dict(session_len=4096)) assert (_get_and_verify_max_len(config, None) == config.session_len) assert (_get_and_verify_max_len(config, 1024) == 1024)
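For reference, a minimal sketch of the workspace round trip exercised by `test_get_names_from_turbomind_model` above: a converted model now stores `triton_models/weights/config.yaml` with nested sections instead of `config.ini`. The `chat_template` value below is a placeholder.

```python
# Minimal sketch of the new workspace config.yaml layout; values are placeholders.
import os
import tempfile

import yaml

with tempfile.TemporaryDirectory() as workspace:
    weights_dir = os.path.join(workspace, 'triton_models', 'weights')
    os.makedirs(weights_dir, exist_ok=True)

    # roughly what `lmdeploy convert` now writes instead of config.ini
    config = dict(model_config=dict(chat_template='internlm2'))
    with open(os.path.join(weights_dir, 'config.yaml'), 'w') as f:
        yaml.safe_dump(config, f)

    # roughly what get_names_from_model() / _from_workspace() read back
    with open(os.path.join(weights_dir, 'config.yaml')) as f:
        loaded = yaml.safe_load(f)
    assert loaded['model_config']['chat_template'] == 'internlm2'
```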