diff --git a/README.md b/README.md index 200dcc5269..84ead92e74 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ pip install torchserve-nightly torch-model-archiver-nightly torch-workflow-archi # Install dependencies python ./ts_scripts/install_dependencies.py -# Include depeendencies for accelerator support with the relevant optional flags +# Include dependencies for accelerator support with the relevant optional flags python ./ts_scripts/install_dependencies.py --rocm=rocm61 python ./ts_scripts/install_dependencies.py --cuda=cu121 diff --git a/docker/Dockerfile b/docker/Dockerfile index 94f4a1ba99..a8556b1fff 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -37,12 +37,12 @@ ARG BRANCH_NAME ARG REPO_URL=https://github.com/pytorch/serve.git ENV PYTHONUNBUFFERED TRUE -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ add-apt-repository -y ppa:deadsnakes/ppa && \ - apt remove python-pip python3-pip && \ + apt remove -y python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ ca-certificates \ g++ \ @@ -55,6 +55,13 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ git \ && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ + if [ "$USE_ROCM_VERSION" ]; then \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \ + && rm -rf /var/lib/apt/lists/* ; \ + fi + # Make the virtual environment and "activating" it by adding it first to the path. 
# From here on the python$PYTHON_VERSION interpreter is used and the packages # are installed in /home/venv which is what we need for the "runtime-image" @@ -67,6 +74,7 @@ RUN python -m pip install -U pip setuptools RUN export USE_CUDA=1 ARG USE_CUDA_VERSION="" +ARG USE_ROCM_VERSION="" COPY ./ serve @@ -76,7 +84,6 @@ RUN \ git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \ fi - WORKDIR "serve" RUN cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh @@ -90,6 +97,14 @@ RUN \ else \ python ./ts_scripts/install_dependencies.py;\ fi; \ + elif echo "${BASE_IMAGE}" | grep -q "rocm/"; then \ + # Install ROCm version specific binary when ROCm version is specified as a build arg + if [ "$USE_ROCM_VERSION" ]; then \ + python ./ts_scripts/install_dependencies.py --rocm $USE_ROCM_VERSION;\ + # Install the binary with the latest CPU image on a ROCm base image + else \ + python ./ts_scripts/install_dependencies.py; \ + fi; \ # Install the CPU binary else \ python ./ts_scripts/install_dependencies.py; \ @@ -111,13 +126,14 @@ FROM ${BASE_IMAGE} AS production-image # Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top) ARG PYTHON_VERSION ENV PYTHONUNBUFFERED TRUE +ARG USE_ROCM_VERSION -RUN --mount=type=cache,target=/var/cache/apt \ +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ add-apt-repository ppa:deadsnakes/ppa -y && \ - apt remove python-pip python3-pip && \ + apt remove -y python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ python$PYTHON_VERSION \ python3-distutils \ @@ -130,6 +146,13 @@ RUN --mount=type=cache,target=/var/cache/apt \ && rm -rf /var/lib/apt/lists/* \ && cd /tmp +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ + if [ "$USE_ROCM_VERSION" ]; then \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get 
install --no-install-recommends -y rocm-dev amd-smi-lib \ + && rm -rf /var/lib/apt/lists/* ; \ + fi + RUN useradd -m model-server \ && mkdir -p /home/model-server/tmp @@ -137,6 +160,11 @@ COPY --chown=model-server --from=compile-image /home/venv /home/venv COPY --from=compile-image /usr/local/bin/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh ENV PATH="/home/venv/bin:$PATH" +RUN \ + if [ "$USE_ROCM_VERSION" ]; then \ + python -m pip install /opt/rocm/share/amd_smi; \ + fi + RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \ && chown -R model-server /home/model-server @@ -157,13 +185,14 @@ FROM ${BASE_IMAGE} AS ci-image ARG PYTHON_VERSION ARG BRANCH_NAME ENV PYTHONUNBUFFERED TRUE +ARG USE_ROCM_VERSION -RUN --mount=type=cache,target=/var/cache/apt \ +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ add-apt-repository -y ppa:deadsnakes/ppa && \ - apt remove python-pip python3-pip && \ + apt remove -y python-pip python3-pip && \ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ python$PYTHON_VERSION \ python3-distutils \ @@ -183,6 +212,12 @@ RUN --mount=type=cache,target=/var/cache/apt \ && rm -rf /var/lib/apt/lists/* \ && cd /tmp +RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ + if [ "$USE_ROCM_VERSION" ]; then \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y rocm-dev amd-smi-lib \ + && rm -rf /var/lib/apt/lists/* ; \ + fi COPY --from=compile-image /home/venv /home/venv @@ -190,6 +225,11 @@ ENV PATH="/home/venv/bin:$PATH" RUN python -m pip install --no-cache-dir -r https://raw.githubusercontent.com/pytorch/serve/$BRANCH_NAME/requirements/developer.txt +RUN \ + if [ "$USE_ROCM_VERSION" ]; then \ + python -m pip install /opt/rocm/share/amd_smi; \ + fi + RUN mkdir /home/serve ENV TS_RUN_IN_DOCKER True @@ -203,11 +243,13 @@ ARG PYTHON_VERSION ARG 
BRANCH_NAME ARG BUILD_FROM_SRC ARG LOCAL_CHANGES +ARG USE_ROCM_VERSION ARG BUILD_WITH_IPEX ARG IPEX_VERSION=1.11.0 ARG IPEX_URL=https://software.intel.com/ipex-whl-stable ENV PYTHONUNBUFFERED TRUE -RUN --mount=type=cache,target=/var/cache/apt \ + +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ apt-get update && \ apt-get upgrade -y && \ apt-get install software-properties-common -y && \ @@ -243,10 +285,17 @@ RUN \ COPY --from=compile-image /home/venv /home/venv ENV PATH="/home/venv/bin:$PATH" + +RUN \ + if [ "$USE_ROCM_VERSION" ]; then \ + python -m pip install /opt/rocm/share/amd_smi; \ + fi + WORKDIR "serve" + RUN python -m pip install -U pip setuptools \ && python -m pip install --no-cache-dir -r requirements/developer.txt \ - && python ts_scripts/install_from_src.py \ + && python ts_scripts/install_from_src.py --environment=dev\ && useradd -m model-server \ && mkdir -p /home/model-server/tmp \ && cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh \ diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm deleted file mode 100644 index a6f578ecb4..0000000000 --- a/docker/Dockerfile.rocm +++ /dev/null @@ -1,320 +0,0 @@ -# syntax = docker/dockerfile:experimental -# -# This file can build images for cpu and gpu env. By default it builds image for CPU. -# Use following option to build image for cuda/GPU: --build-arg BASE_IMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 -# Here is complete command for GPU/cuda - -# $ DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 -t torchserve:latest . 
-# -# Following comments have been shamelessly copied from https://github.com/pytorch/pytorch/blob/master/Dockerfile -# -# NOTE: To build this you will need a docker version > 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# If you do not use buildkit you are not going to have a good time -# -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ - -ARG BASE_IMAGE=ubuntu:24.04 -ARG BRANCH_NAME=master -# Note: -# Define here the default python version to be used in all later build-stages as default. -# ARG and ENV variables do not persist across stages (they're build-stage scoped). -# That is crucial for ARG PYTHON_VERSION, which otherwise becomes "" leading to nasty bugs, -# that don't let the build fail, but break current version handling logic and result -# in images with wrong python version. To fix that, we will restate the ARG PYTHON_VERSION -# on each build-stage. -ARG PYTHON_VERSION=3.11 - -FROM ${BASE_IMAGE} AS compile-image -ARG BASE_IMAGE=ubuntu:24.04 -ARG PYTHON_VERSION -ARG BUILD_NIGHTLY -ARG BUILD_FROM_SRC -ARG LOCAL_CHANGES -ARG BRANCH_NAME -ARG REPO_URL=https://github.com/pytorch/serve.git -ENV PYTHONUNBUFFERED TRUE - -RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ - apt-get update && \ - apt-get upgrade -y && \ - apt-get install software-properties-common -y && \ - add-apt-repository -y ppa:deadsnakes/ppa && \ - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - ca-certificates \ - g++ \ - python3-setuptools \ - python$PYTHON_VERSION \ - python$PYTHON_VERSION-dev \ - python$PYTHON_VERSION-venv \ - openjdk-17-jdk \ - curl \ - wget \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Make the virtual environment and "activating" it by adding it first to the path. 
-# From here on the python$PYTHON_VERSION interpreter is used and the packages -# are installed in /home/venv which is what we need for the "runtime-image" -RUN python$PYTHON_VERSION -m venv /home/venv -ENV PATH="/home/venv/bin:$PATH" - -ARG USE_ROCM_VERSION="" - -COPY ./ serve - -RUN \ - if echo "$LOCAL_CHANGES" | grep -q "false"; then \ - rm -rf /serve;\ - git clone --recursive $REPO_URL -b $BRANCH_NAME /serve; \ - fi - -WORKDIR "/serve" - -RUN cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh - -RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ - if [ -n "$USE_ROCM_VERSION" ]; then \ - apt-get update \ - && wget https://repo.radeon.com/amdgpu-install/6.2.2/ubuntu/noble/amdgpu-install_6.2.60202-1_all.deb \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y ./amdgpu-install_6.2.60202-1_all.deb \ - && apt-get update \ - && apt-get install --no-install-recommends -y amdgpu-dkms rocm; \ - else \ - echo "Skip ROCm installation"; \ - fi - -RUN \ - # Install ROCm version specific binary when ROCm version is specified as a build arg - if [ "$USE_ROCM_VERSION" ]; then \ - python$PYTHON_VERSION ./ts_scripts/install_dependencies.py --rocm $USE_ROCM_VERSION; \ - # Install the binary with the latest CPU image on a ROCm base image - else \ - python$PYTHON_VERSION ./ts_scripts/install_dependencies.py;\ - fi; - -# Make sure latest version of torchserve is uploaded before running this -RUN \ - if echo "$BUILD_FROM_SRC" | grep -q "true"; then \ - python$PYTHON_VERSION -m pip install -r requirements/developer.txt \ - && python$PYTHON_VERSION ts_scripts/install_from_src.py;\ - elif echo "$BUILD_NIGHTLY" | grep -q "false"; then \ - python$PYTHON_VERSION -m pip install --no-cache-dir torchserve torch-model-archiver torch-workflow-archiver;\ - else \ - python$PYTHON_VERSION -m pip install --no-cache-dir torchserve-nightly torch-model-archiver-nightly torch-workflow-archiver-nightly;\ - fi - -# Final image for production -FROM ${BASE_IMAGE} 
AS production-image -# Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top) -ARG PYTHON_VERSION -ENV PYTHONUNBUFFERED TRUE -ARG USE_ROCM_VERSION - -RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ - apt-get update && \ - apt-get upgrade -y && \ - apt-get install software-properties-common -y && \ - add-apt-repository ppa:deadsnakes/ppa -y && \ - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - python$PYTHON_VERSION \ - python3-setuptools \ - python$PYTHON_VERSION-dev \ - python$PYTHON_VERSION-venv \ - # using openjdk-17-jdk due to circular dependency(ca-certificates) bug in openjdk-17-jre-headless debian package - # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1009905 - openjdk-17-jdk \ - build-essential \ - wget \ - && rm -rf /var/lib/apt/lists/* \ - && cd /tmp - -RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ - if [ -n "$USE_ROCM_VERSION" ]; then \ - apt-get update \ - && wget https://repo.radeon.com/amdgpu-install/6.2.2/ubuntu/noble/amdgpu-install_6.2.60202-1_all.deb \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y ./amdgpu-install_6.2.60202-1_all.deb \ - && apt-get update \ - && apt-get install --no-install-recommends -y amdgpu-dkms rocm; \ - else \ - echo "Skip ROCm installation"; \ - fi - -RUN useradd -m model-server \ - && mkdir -p /home/model-server/tmp - -COPY --chown=model-server --from=compile-image /home/venv /home/venv -COPY --from=compile-image /usr/local/bin/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh -ENV PATH="/home/venv/bin:$PATH" - -RUN \ - if [ -n "$USE_ROCM_VERSION" ]; then \ - python$PYTHON_VERSION -m pip install -U pip setuptools \ - && python -m pip install /opt/rocm/share/amd_smi; \ - fi - -RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \ - && chown -R model-server /home/model-server - -COPY docker/config.properties /home/model-server/config.properties -RUN mkdir /home/model-server/model-store && 
chown -R model-server /home/model-server/model-store - -EXPOSE 8080 8081 8082 7070 7071 - -USER model-server -WORKDIR /home/model-server -ENV TEMP=/home/model-server/tmp -ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] -CMD ["serve"] - -# Final image for docker regression -FROM ${BASE_IMAGE} AS ci-image -# Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top) -ARG PYTHON_VERSION -ARG BRANCH_NAME -ARG USE_ROCM_VERSION -ENV PYTHONUNBUFFERED TRUE - -RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ - apt-get update && \ - apt-get upgrade -y && \ - apt-get install software-properties-common -y && \ - add-apt-repository -y ppa:deadsnakes/ppa && \ - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - python$PYTHON_VERSION \ - python3-setuptools \ - python$PYTHON_VERSION-dev \ - python$PYTHON_VERSION-venv \ - # using openjdk-17-jdk due to circular dependency(ca-certificates) bug in openjdk-17-jre-headless debian package - # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1009905 - openjdk-17-jdk \ - build-essential \ - wget \ - numactl \ - nodejs \ - npm \ - zip \ - unzip \ - && npm install -g newman@5.3.2 newman-reporter-htmlextra markdown-link-check \ - && rm -rf /var/lib/apt/lists/* \ - && cd /tmp - -RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ - if [ -n "$USE_ROCM_VERSION" ]; then \ - apt-get update \ - && wget https://repo.radeon.com/amdgpu-install/6.2.2/ubuntu/noble/amdgpu-install_6.2.60202-1_all.deb \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y ./amdgpu-install_6.2.60202-1_all.deb \ - && apt-get update \ - && apt-get install --no-install-recommends -y amdgpu-dkms rocm; \ - else \ - echo "Skip ROCm installation"; \ - fi - -COPY --from=compile-image /home/venv /home/venv - -ENV PATH="/home/venv/bin:$PATH" - -RUN \ - if [ -n "$USE_ROCM_VERSION" ]; then \ - python$PYTHON_VERSION -m pip install -U pip setuptools \ - && python -m pip install 
/opt/rocm/share/amd_smi; \ - fi - -RUN python$PYTHON_VERSION -m pip install --no-cache-dir -r https://raw.githubusercontent.com/pytorch/serve/$BRANCH_NAME/requirements/developer.txt - -RUN mkdir /serve -ENV TS_RUN_IN_DOCKER True - -WORKDIR /serve -CMD ["python", "test/regression_tests.py"] - -#Final image for developer Docker image -FROM ${BASE_IMAGE} as dev-image -# Re-state ARG PYTHON_VERSION to make it active in this build-stage (uses default define at the top) -ARG PYTHON_VERSION -ARG BRANCH_NAME -ARG BUILD_FROM_SRC -ARG USE_ROCM_VERSION -ARG LOCAL_CHANGES -ARG BUILD_WITH_IPEX -ARG IPEX_VERSION=1.11.0 -ARG IPEX_URL=https://software.intel.com/ipex-whl-stable -ENV PYTHONUNBUFFERED TRUE -RUN --mount=type=cache,sharing=locked,target=/var/cache/apt \ - apt-get update && \ - apt-get upgrade -y && \ - apt-get install software-properties-common -y && \ - add-apt-repository -y ppa:deadsnakes/ppa && \ - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - fakeroot \ - ca-certificates \ - dpkg-dev \ - sudo \ - g++ \ - git \ - python$PYTHON_VERSION \ - python$PYTHON_VERSION-dev \ - python3-setuptools \ - python$PYTHON_VERSION-venv \ - # using openjdk-17-jdk due to circular dependency(ca-certificates) bug in openjdk-17-jre-headless debian package - # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1009905 - openjdk-17-jdk \ - build-essential \ - wget \ - curl \ - vim \ - numactl \ - nodejs \ - npm \ - zip \ - unzip \ - && npm install -g newman@5.3.2 newman-reporter-htmlextra markdown-link-check \ - && rm -rf /var/lib/apt/lists/* - -RUN --mount=type=cache,sharing=locked,id=apt-dev,target=/var/cache/apt \ - if [ -n "$USE_ROCM_VERSION" ]; then \ - apt-get update \ - && wget https://repo.radeon.com/amdgpu-install/6.2.2/ubuntu/noble/amdgpu-install_6.2.60202-1_all.deb \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y ./amdgpu-install_6.2.60202-1_all.deb \ - && apt-get update \ - && apt-get install --no-install-recommends -y amdgpu-dkms rocm; \ - 
else \ - echo "Skip ROCm installation"; \ - fi - -COPY ./ serve - -RUN \ - if echo "$LOCAL_CHANGES" | grep -q "false"; then \ - rm -rf /serve;\ - git clone --recursive $REPO_URL -b $BRANCH_NAME /serve; \ - fi - -COPY --from=compile-image /home/venv /home/venv -ENV PATH="/home/venv/bin:$PATH" - -RUN \ - if [ -n "$USE_ROCM_VERSION" ]; then \ - python$PYTHON_VERSION -m pip install -U pip setuptools \ - && python -m pip install /opt/rocm/share/amd_smi; \ - fi - -WORKDIR "serve" - -RUN python$PYTHON_VERSION -m pip install -U pip setuptools \ - && python$PYTHON_VERSION -m pip install --no-cache-dir -r requirements/developer.txt \ - && python$PYTHON_VERSION ts_scripts/install_from_src.py --environment=dev \ - && useradd -m model-server \ - && mkdir -p /home/model-server/tmp \ - && cp docker/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh \ - && chmod +x /usr/local/bin/dockerd-entrypoint.sh \ - && chown -R model-server /home/model-server \ - && cp docker/config.properties /home/model-server/config.properties \ - && mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store \ - && chown -R model-server /home/venv -EXPOSE 8080 8081 8082 7070 7071 -WORKDIR /home/model-server -ENV TEMP=/home/model-server/tmp -ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] -CMD ["serve"] diff --git a/docker/README.md b/docker/README.md index 9e5ca8a229..f7b5e987f1 100644 --- a/docker/README.md +++ b/docker/README.md @@ -44,6 +44,7 @@ Use `build_image.sh` script to build the docker images. The script builds the `p |-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, ci| |-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.| |-cv, --cudaversion| Specify to cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`. `cu121`, Default `cu121`| +|-rv, --rocmversion| Specify to rocm version to use. 
Supported values `rocm60`, `rocm61`, `rocm62` | |-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.| |-cpp, --build-cpp specify to build TorchServe CPP| |-n, --nightly| Specify to build with TorchServe nightly.| @@ -62,9 +63,9 @@ Creates a docker image with publicly available `torchserve` and `torch-model-arc ./build_image.sh ``` - - To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118` + - To create a GPU based image with cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118` for CUDA and `rocm60`, `rocm61`, `rocm62` for ROCm. - - GPU images are built with NVIDIA CUDA base image. If you want to use ONNX, please specify the base image as shown in the next section. + - GPU images are built with either NVIDIA CUDA base image or AMD ROCm base image. If you want to use ONNX, please specify the base image as shown in the next section. ```bash ./build_image.sh -g -cv cu117 @@ -132,6 +133,24 @@ Creates a docker image with `torchserve` and `torch-model-archiver` installed fr ./build_image.sh -bt dev -g -cv cu92 ``` +- For creating GPU based image with rocm version 6.0: + +```bash +./build_image.sh -bt dev -g -rv rocm60 +``` + +- For creating GPU based image with rocm version 6.1: + +```bash +./build_image.sh -bt dev -g -rv rocm61 +``` + +- For creating GPU based image with rocm version 6.2: + +```bash +./build_image.sh -bt dev -g -rv rocm62 +``` + - For creating GPU based image with a different branch: ```bash @@ -164,7 +183,7 @@ Creates a docker image with `torchserve` and `torch-model-archiver` installed fr ./build_image.sh -bt dev -g [-cv cu121|cu118] -cpp ``` -- For ROCm support (*experimental*), refer to [this documentation](../docs/hardware_support/amd_support.md). 
+- For more ROCm support (*experimental*), refer to [this documentation](../docs/hardware_support/amd_support.md). ## Start a container with a TorchServe image @@ -204,6 +223,12 @@ For GPU latest image with gpu devices 1 and 2: docker run --rm -it --gpus '"device=1,2"' -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 pytorch/torchserve:latest-gpu ``` +For GPU with ROCm support with gpu devices 1 and 2: + +```bash +docker run --rm -it --device=/dev/kfd --device=/dev/dri -e HIP_VISIBLE_DEVICES=1,2 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:7070:7070 -p 127.0.0.1:7071:7071 pytorch/torchserve:latest-gpu +``` + For specific versions you can pass in the specific tag to use (ex: `0.1.1-cuda10.1-cudnn7-runtime`): ```bash diff --git a/docker/build_image.sh b/docker/build_image.sh index b5b9f8e87e..2b25275374 100755 --- a/docker/build_image.sh +++ b/docker/build_image.sh @@ -11,6 +11,7 @@ BASE_IMAGE="ubuntu:20.04" UPDATE_BASE_IMAGE=false USE_CUSTOM_TAG=false CUDA_VERSION="" +ROCM_VERSION="" USE_LOCAL_SERVE_FOLDER=false BUILD_WITH_IPEX=false BUILD_CPP=false @@ -33,6 +34,7 @@ do echo "-bi, --baseimage specify base docker image. Example: nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04 " echo "-bt, --buildtype specify for type of created image. Possible values: production, dev, ci." 
echo "-cv, --cudaversion specify to cuda version to use" + echo "-rv, --rocmversion specify to rocm version to use" echo "-t, --tag specify tag name for docker image" echo "-lf, --use-local-serve-folder specify this option for the benchmark image if the current 'serve' folder should be used during automated benchmarks" echo "-ipex, --build-with-ipex specify to build with intel_extension_for_pytorch" @@ -167,6 +169,24 @@ do shift shift ;; + -rv|--rocmversion) + ROCM_VERSION="$2" + if [ "${ROCM_VERSION}" == "rocm60" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.0.2" + elif [ "${ROCM_VERSION}" == "rocm61" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.1.2" + elif [ "${ROCM_VERSION}" == "rocm62" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.2.4" + else + echo "ROCM version not supported" + exit 1 + fi + shift + shift + ;; esac done @@ -218,30 +238,47 @@ then exit 1 fi fi + + if [[ "${MACHINE}" == "gpu" || "${ROCM_VERSION}" != "" ]]; + then + if [ "${ROCM_VERSION}" == "rocm60" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.0.2" + elif [ "${ROCM_VERSION}" == "rocm61" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.1.2" + elif [ "${ROCM_VERSION}" == "rocm62" ]; + then + BASE_IMAGE="rocm/dev-ubuntu-22.04:6.2.4" + else + echo "ROCm version $ROCM_VERSION is not supported for CPP" + exit 1 + fi + fi fi if [ "${BUILD_TYPE}" == "production" ]; then if [ "${MULTI}" == "true" ]; then - DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\
--build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --platform "${ARCH}" --target production-image ../ --push else - DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker buildx build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\ --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --target production-image ../ --load fi elif [ "${BUILD_TYPE}" == "ci" ]; then - DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}"\ --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}" -t "${DOCKER_TAG}" --target ci-image ../ else if [ "${BUILD_CPP}" == "true" ] then - DOCKER_BUILDKIT=1 docker build --file Dockerfile.cpp --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker build --file Dockerfile.cpp --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg 
PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" -t "${DOCKER_TAG}" --target cpp-dev-image . else - DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ + DOCKER_BUILDKIT=1 docker build --file Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg USE_CUDA_VERSION="${CUDA_VERSION}" --build-arg USE_ROCM_VERSION="${ROCM_VERSION}" --build-arg PYTHON_VERSION="${PYTHON_VERSION}"\ --build-arg BUILD_NIGHTLY="${BUILD_NIGHTLY}" --build-arg BRANCH_NAME="${BRANCH_NAME}" --build-arg REPO_URL="${REPO_URL}" --build-arg BUILD_FROM_SRC="${BUILD_FROM_SRC}" --build-arg LOCAL_CHANGES="${LOCAL_CHANGES}"\ --build-arg BUILD_WITH_IPEX="${BUILD_WITH_IPEX}" -t "${DOCKER_TAG}" --target dev-image ../ fi diff --git a/docs/hardware_support/amd_support.md b/docs/hardware_support/amd_support.md index 55de40f6d4..7029b2f36e 100644 --- a/docs/hardware_support/amd_support.md +++ b/docs/hardware_support/amd_support.md @@ -5,7 +5,7 @@ TorchServe can be run on any combination of operating system and device that is ## Supported Versions of ROCm -The current stable `major.patch` version of ROCm and the previous path version will be supported. For example version `N.2` and `N.1` where `N` is the current major version. +The current stable `major.patch` version of ROCm and the previous patch version will be supported. For example version `N.2` and `N.1` where `N` is the current major version. ## Installation @@ -35,7 +35,7 @@ The current stable `major.patch` version of ROCm and the previous path version w - install the dependencies needed for ROCm support. 
```bash - python ./ts_scripts/install_dependencies.py --rocm=rocm61 + python ./ts_scripts/install_dependencies.py --rocm=rocm62 python ./ts_scripts/install_from_src.py ``` - enable amd-smi in the python virtual environment @@ -60,18 +60,31 @@ If you have 8 accelerators but only want TorchServe to see the last four of them > ⚠️ Setting both `CUDA_VISIBLE_DEVICES` and `HIP_VISIBLE_DEVICES` may cause unintended behaviour and should be avoided. > Doing so may cause an exception in the future. -## Docker +## Docker **In Development** -`Dockerfile.rocm` provides preliminary ROCm support for TorchServe. +`Dockerfile` and `build_image.sh` provide ROCm support for TorchServe. Building and running `dev-image`: ```bash -docker build --file docker/Dockerfile.rocm --target dev-image -t torch-serve-dev-image-rocm --build-arg USE_ROCM_VERSION=rocm62 --build-arg BUILD_FROM_SRC=true . +./build_image.sh -bt dev -g -rv rocm62 -t torch-serve-dev-image-rocm +docker run -it --rm --device=/dev/kfd --device=/dev/dri torch-serve-dev-image-rocm bash +``` + +Building and running `ci-image`: -docker run -it --rm --device=/dev/kfd --device=/dev/dri torch-serve-dev-image-rocm bash +```bash +./build_image.sh -bt ci -g -rv rocm62 -t torch-serve-ci-image-rocm +docker run -it --rm --device=/dev/kfd --device=/dev/dri torch-serve-ci-image-rocm +``` + +Building and running `production-image`: + +```bash +./build_image.sh -bt production -g -rv rocm62 -t torch-serve-production-image-rocm +docker run -it --rm --device=/dev/kfd --device=/dev/dri torch-serve-production-image-rocm ``` ## Example Usage diff --git a/ts_scripts/api_utils.py b/ts_scripts/api_utils.py index 02e1fa4bc3..b038754d51 100755 --- a/ts_scripts/api_utils.py +++ b/ts_scripts/api_utils.py @@ -2,6 +2,7 @@ import os import shutil import sys +import time REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") sys.path.append(REPO_ROOT) @@ -108,10 +109,39 @@ def cleanup_model_store(): os.remove(f) -def move_logs(log_file,
artifact_dir):
+def move_logs(log_file, artifact_dir, retries=5):
+    """
+    Move log_file into logs/ and logs/ into artifact_dir, merging the files into
+    artifact_dir/logs if it already exists; tries up to `retries` times on OSError.
+    """
     logs_dir = os.path.join("logs")
-    os.rename(log_file, os.path.join(logs_dir, log_file))  # mv file logs/
-    os.rename(logs_dir, os.path.join(artifact_dir, logs_dir))  # mv logs/ dir
+
+    os.makedirs(logs_dir, exist_ok=True)
+    shutil.move(log_file, os.path.join(logs_dir, log_file))  # mv file logs/
+
+    destination_dir = os.path.join(artifact_dir, logs_dir)
+
+    # Retry is used because the directory might not be ready to be moved.
+    for attempt in range(retries):
+        try:
+            if os.path.exists(destination_dir):
+                # Merge contents if destination directory already exists
+                for root, _dirs, files in os.walk(logs_dir):
+                    for file in files:
+                        shutil.move(
+                            os.path.join(root, file),
+                            os.path.join(destination_dir, file),
+                        )
+                shutil.rmtree(logs_dir)  # Remove the empty logs directory
+            else:
+                shutil.move(logs_dir, destination_dir)  # mv logs/ dir
+            break
+        except OSError:
+            # Retry filesystem errors only; never swallow KeyboardInterrupt/SystemExit.
+            if attempt < retries - 1:
+                time.sleep(2)
+            else:
+                raise


 def trigger_management_tests():