Skip to content

Commit

Permalink
refactor lmi, neuron, and trt dockerfiles to install python requirements from requirements file
Browse files Browse the repository at this point in the history
  • Loading branch information
siddvenk committed Nov 12, 2024
1 parent 8481f56 commit 7dc354a
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 137 deletions.
80 changes: 13 additions & 67 deletions serving/docker/lmi.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,33 +14,9 @@ FROM nvidia/cuda:$version
ARG cuda_version=cu124
ARG djl_version
ARG djl_serving_version
# Base Deps
ARG python_version=3.11
ARG torch_version=2.5.1
ARG torch_vision_version=0.20.1
ARG djl_torch_version=2.4.0
ARG onnx_version=1.19.0
ARG pydantic_version=2.9.2
ARG djl_converter_wheel="https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl"
# HF Deps
ARG protobuf_version=3.20.3
ARG transformers_version=4.45.2
ARG accelerate_version=1.0.1
ARG bitsandbytes_version=0.44.1
ARG optimum_version=1.23.2
ARG auto_gptq_version=0.7.1
ARG datasets_version=3.0.1
ARG autoawq_version=0.2.5
ARG tokenizers_version=0.20.1
# LMI-Dist Deps
ARG vllm_wheel="https://publish.djl.ai/vllm/cu124-pt251/vllm-0.6.3.post1%2Bcu124-cp311-cp311-linux_x86_64.whl"
ARG flash_infer_wheel="https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl"
# %2B is the url escape for the '+' character
ARG lmi_dist_wheel="https://publish.djl.ai/lmi_dist/lmi_dist-13.0.0-cp311-cp311-linux_x86_64.whl"
ARG seq_scheduler_wheel="https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl"
ARG peft_version=0.13.2

ARG sagemaker_fast_model_loader_wheel="https://publish.djl.ai/fast-model-loader/sagemaker_fast_model_loader-0.1.0-cp311-cp311-linux_x86_64.whl"

EXPOSE 8080

Expand Down Expand Up @@ -92,6 +68,9 @@ COPY partition /opt/djl/partition
COPY distribution[s]/ ./
RUN mv *.deb djl-serving_all.deb || true

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq libaio-dev libopenmpi-dev g++ \
&& scripts/install_openssh.sh \
&& scripts/install_djl_serving.sh $djl_version $djl_serving_version \
Expand All @@ -105,49 +84,16 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -yq libaio-
&& pip3 cache purge \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*

RUN pip3 install torch==${torch_version} torchvision==${torch_vision_version} --index-url https://download.pytorch.org/whl/cu124
RUN pip3 install \
${seq_scheduler_wheel} \
peft==${peft_version} \
protobuf==${protobuf_version} \
transformers==${transformers_version} \
hf-transfer \
zstandard \
datasets==${datasets_version} \
mpi4py \
sentencepiece \
tiktoken \
blobfile \
einops \
accelerate==${accelerate_version} \
bitsandbytes==${bitsandbytes_version} \
auto-gptq==${auto_gptq_version} \
pandas \
pyarrow \
jinja2 \
retrying \
opencv-contrib-python-headless \
safetensors \
scipy \
onnx \
sentence_transformers \
onnxruntime \
autoawq==${autoawq_version} \
tokenizers==${tokenizers_version} \
pydantic==${pydantic_version} \
${djl_converter_wheel} \
optimum==${optimum_version} \
${flash_infer_wheel} \
${vllm_wheel} \
${lmi_dist_wheel} \
torch==${torch_version} \
torchvision==${torch_vision_version} \
${sagemaker_fast_model_loader_wheel} \
&& git clone https://github.com/neuralmagic/AutoFP8.git && cd AutoFP8 && git reset --hard 4b2092c && pip3 install . && cd .. && rm -rf AutoFP8 \
&& pip3 cache purge

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*
# Install the Python dependencies for the LMI image from the requirements file
COPY requirements-lmi.txt ./requirements.txt
# Purge the pip cache in the same layer that fills it; a purge in a later RUN
# cannot shrink this layer.
RUN pip3 install -r requirements.txt && pip3 cache purge
# TODO: Migrate to llmcompressor, this repo is deprecated
# AutoFP8 is built from source at a fixed commit (4b2092c); --no-deps keeps pip
# from installing any dependencies for it, and the clone is removed afterwards.
RUN git clone https://github.com/neuralmagic/AutoFP8.git && \
    cd AutoFP8 && \
    git reset --hard 4b2092c && \
    pip3 install . --no-deps && \
    cd .. && \
    rm -rf AutoFP8 && \
    pip3 cache purge

RUN scripts/patch_oss_dlc.sh python \
&& scripts/security_patch.sh lmi \
Expand Down
28 changes: 6 additions & 22 deletions serving/docker/pytorch-inf2.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,9 @@ FROM ubuntu:22.04
ARG djl_version
ARG djl_serving_version
ARG torch_version=2.1.2
ARG torchvision_version=0.16.2
ARG python_version=3.10
ARG python_version=3.11
ARG neuronsdk_version=2.20.0
ARG torch_neuronx_version=2.1.2.2.3.0
ARG transformers_neuronx_version=0.12.313
ARG neuronx_distributed_version=0.9.0
ARG neuronx_cc_version=2.15.128.0
ARG neuronx_cc_stubs_version=2.15.128.0
ARG torch_xla_version=2.1.4
ARG transformers_version=4.45.2
ARG accelerate_version=0.29.2
ARG diffusers_version=0.28.2
ARG pydantic_version=2.6.1
ARG optimum_neuron_version=0.0.24
ARG huggingface_hub_version=0.25.2
# %2B is the url escape for the '+' character
ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-0.6.2%2Bnightly-py3-none-any.whl"
EXPOSE 8080
Expand All @@ -47,12 +35,12 @@ ENV TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers
ENV PYTORCH_KERNEL_CACHE_PATH=/tmp/.cache
ENV MODEL_LOADING_TIMEOUT=1200
ENV PREDICT_TIMEOUT=240
ENV NEURON_SDK_PATH=/usr/local/lib/python3.10/dist-packages/torch_neuronx/lib
# The site directory is named pythonX.Y, so the "python" prefix must precede the version build arg
ENV NEURON_SDK_PATH=/usr/local/lib/python${python_version}/dist-packages/torch_neuronx/lib
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NEURON_SDK_PATH
ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib
# The site directory is named pythonX.Y, so the "python" prefix must precede the version build arg
ENV PYTORCH_LIBRARY_PATH=/usr/local/lib/python${python_version}/dist-packages/torch/lib
ENV PYTORCH_EXTRA_LIBRARY_PATH=$NEURON_SDK_PATH/libtorchneuron.so
ENV PYTORCH_PRECXX11=true
ENV PYTORCH_VERSION=2.1.2
ENV PYTORCH_VERSION=${torch_version}
ENV JAVA_OPTS="-Xmx1g -Xms1g -Xss2m -XX:+ExitOnOutOfMemoryError"
ENV NEURON_CC_FLAGS="--logfile /tmp/compile.log --temp-dir=/tmp"
ENV SERVING_FEATURES=vllm,lmi-dist,tnx
Expand All @@ -70,18 +58,14 @@ RUN mkdir -p /opt/djl/conf && \
mkdir -p /opt/ml/model
COPY config.properties /opt/djl/conf/
COPY partition /opt/djl/partition
COPY requirements-neuron.txt ./requirements.txt
RUN mkdir -p /opt/djl/bin && cp scripts/telemetry.sh /opt/djl/bin && \
echo "${djl_serving_version} inf2" > /opt/djl/bin/telemetry && \
scripts/install_python.sh && \
scripts/install_djl_serving.sh $djl_version $djl_serving_version && \
scripts/install_djl_serving.sh $djl_version $djl_serving_version ${torch_version} && \
scripts/install_inferentia2.sh && \
pip install accelerate==${accelerate_version} safetensors torchvision==${torchvision_version} \
neuronx-cc==${neuronx_cc_version} torch-neuronx==${torch_neuronx_version} transformers-neuronx==${transformers_neuronx_version} \
torch_xla==${torch_xla_version} neuronx-cc-stubs==${neuronx_cc_stubs_version} huggingface-hub==${huggingface_hub_version} \
neuronx_distributed==${neuronx_distributed_version} protobuf sentencepiece jinja2 \
diffusers==${diffusers_version} opencv-contrib-python-headless Pillow --extra-index-url=https://pip.repos.neuron.amazonaws.com \
pydantic==${pydantic_version} optimum optimum-neuron==${optimum_neuron_version} tiktoken blobfile && \
pip install -r requirements.txt && \
pip install transformers==${transformers_version} ${vllm_wheel} && \
echo y | pip uninstall triton && \
scripts/install_s5cmd.sh x64 && \
Expand Down
42 changes: 42 additions & 0 deletions serving/docker/requirements-lmi.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
torch==2.5.1
torchvision==0.20.1
# sequence scheduler for hf accelerate rolling batch
https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
peft==0.13.2
protobuf==3.20.3
transformers==4.45.2
hf-transfer
zstandard
datasets==3.0.1
mpi4py
sentencepiece
tiktoken
blobfile
einops
accelerate==1.0.1
bitsandbytes==0.44.1
auto-gptq==0.7.1
pandas
pyarrow
jinja2
retrying
opencv-contrib-python-headless
safetensors
scipy
onnx
sentence_transformers
onnxruntime
autoawq==0.2.5
tokenizers==0.20.1
pydantic==2.9.2
# djl converter wheel for converting hf models to onnx/rust
https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl
optimum==1.23.2
# flashinfer for vllm
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
# vllm wheel using PT 2.5.1
https://publish.djl.ai/vllm/cu124-pt251/vllm-0.6.3.post1%2Bcu124-cp311-cp311-linux_x86_64.whl
# lmi-dist wheel - need to change this to the one built by ci once ready
https://publish.djl.ai/lmi_dist/lmi_dist-13.0.0-cp311-cp311-linux_x86_64.whl
# sagemaker fast model loader wheel
https://publish.djl.ai/fast-model-loader/sagemaker_fast_model_loader-0.1.0-cp311-cp311-linux_x86_64.whl
23 changes: 23 additions & 0 deletions serving/docker/requirements-neuron.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Python dependencies for the Neuron (inf2) image. pytorch-inf2.Dockerfile copies
# this file to ./requirements.txt and installs it with `pip install -r requirements.txt`.
# Extra package index consulted in addition to the default one
# (presumably hosts the Neuron SDK packages - TODO confirm).
--extra-index-url https://pip.repos.neuron.amazonaws.com
accelerate==0.29.2
safetensors
torchvision==0.16.2
neuronx-cc==2.15.128.0
torch-neuronx==2.1.2.2.3.0
transformers-neuronx==0.12.313
torch_xla==2.1.4
neuronx-cc-stubs==2.15.128.0
huggingface-hub==0.25.2
neuronx_distributed==0.9.0
protobuf
sentencepiece
jinja2
diffusers==0.28.2
opencv-contrib-python-headless
pillow
pydantic==2.6.1
optimum
optimum-neuron==0.0.25
tiktoken
blobfile
# NOTE(review): unpinned here; the Dockerfile also runs a separate
# `pip install transformers==...` after this file - confirm which version is intended.
transformers
38 changes: 38 additions & 0 deletions serving/docker/requirements-trt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
torch==2.4.0
transformers==4.42.4
accelerate==0.32.1
peft==0.13.2
sentencepiece
mpi4py
cuda-python==12.5
onnx
polygraphy
pynvml==11.5.0
datasets==2.19.1
pydantic==2.6.1
scipy
torchprofile
bitsandbytes
ninja
transformers_stream_generator
einops
tiktoken
jinja2
graphviz
blobfile
colored
h5py
strenum
pulp
flax
easydict
tensorrt==10.3.0
janus==1.0.0
nvidia-modelopt==0.15.0
numpy==1.26.4
# TRTLLM toolkit wheel
https://publish.djl.ai/tensorrt-llm/toolkit/tensorrt_llm_toolkit-0.12.0%2Bnightly-py3-none-any.whl
# TRTLLM wheel - contains necessary patch fix not available in OSS
https://publish.djl.ai/tensorrt-llm/v0.12.0/tensorrt_llm-0.12.0-cp310-cp310-linux_x86_64.whl
# Triton toolkit wheel - pybindings for trtllm
https://publish.djl.ai/tritonserver/r24.04/tritontoolkit-24.4-py310-none-any.whl
66 changes: 18 additions & 48 deletions serving/docker/tensorrt-llm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,8 @@ ARG version=12.5.1-devel-ubuntu22.04
FROM nvidia/cuda:$version
ARG cuda_version=cu125
ARG python_version=3.10
ARG TORCH_VERSION=2.4.0
ARG djl_version
ARG djl_serving_version
ARG transformers_version=4.44.2
ARG accelerate_version=0.32.1
ARG tensorrtlibs_version=10.1.0
# %2B is the url escape for the '+' character
ARG trtllm_toolkit_version=0.12.0%2Bnightly
ARG trtllm_version=v0.12.0
ARG cuda_python_version=12.5
ARG peft_version=0.10.0
ARG triton_version=r24.04
ARG trtllm_toolkit_wheel="https://publish.djl.ai/tensorrt-llm/toolkit/tensorrt_llm_toolkit-${trtllm_toolkit_version}-py3-none-any.whl"
ARG trtllm_wheel="https://publish.djl.ai/tensorrt-llm/${trtllm_version}/tensorrt_llm-0.12.0-cp310-cp310-linux_x86_64.whl"
ARG triton_toolkit_wheel="https://publish.djl.ai/tritonserver/${triton_version}/tritontoolkit-24.4-py310-none-any.whl"
ARG pydantic_version=2.6.1
ARG modelopt_version=0.15.0
ARG janus_version=1.0.0
ARG pynvml_verison=11.5.0
ARG numpy_version=1.26.4
ARG datasets_version=2.19.1

EXPOSE 8080

Expand Down Expand Up @@ -68,36 +49,30 @@ COPY partition /opt/djl/partition
COPY distribution[s]/ ./
RUN mv *.deb djl-serving_all.deb || true

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-5 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install OpenMPI and other deps
ARG DEBIAN_FRONTEND=noninteractive
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y g++ wget unzip openmpi-bin libopenmpi-dev libffi-dev git-lfs rapidjson-dev graphviz && \
scripts/install_python.sh ${python_version} && \
pip3 cache purge && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install PyTorch
# Qwen needs transformers_stream_generator, tiktoken and einops
RUN pip install torch==${TORCH_VERSION} transformers==${transformers_version} accelerate==${accelerate_version} peft==${peft_version} sentencepiece \
mpi4py cuda-python==${cuda_python_version} onnx polygraphy pynvml==${pynvml_verison} datasets==${datasets_version} pydantic==${pydantic_version} scipy torchprofile bitsandbytes ninja \
transformers_stream_generator einops tiktoken jinja2 graphviz blobfile colored h5py strenum pulp flax easydict && \
pip3 cache purge

# Install TensorRT and TRT-LLM Deps
RUN pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt==${tensorrtlibs_version} janus==${janus_version} nvidia-modelopt==${modelopt_version} && \
pip install --no-deps ${trtllm_wheel} && \
pyver=$(echo $python_version | awk -F. '{print $1$2}') && \
pip3 cache purge

# download dependencies
RUN pip install ${triton_toolkit_wheel} ${trtllm_toolkit_wheel} && \
mkdir -p /opt/tritonserver/lib && mkdir -p /opt/tritonserver/backends/tensorrtllm && \
curl -o /opt/tritonserver/lib/libtritonserver.so https://publish.djl.ai/tritonserver/${triton_version}/libtritonserver.so && \
curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libtriton_tensorrtllm.so && \
curl -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm_common.so https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libtriton_tensorrtllm_common.so && \
curl -o /opt/tritonserver/lib/libnvinfer_plugin_tensorrt_llm.so.10 https://publish.djl.ai/tensorrt-llm/${trtllm_version}/libnvinfer_plugin_tensorrt_llm.so.10 && \
pip3 cache purge && \
# download trt/triton dependencies
# curl -f turns an HTTP error into a failed build step instead of silently saving
# the server's error page as a .so file; -L follows redirects.
RUN mkdir -p /opt/tritonserver/lib /opt/tritonserver/backends/tensorrtllm && \
    curl -fL -o /opt/tritonserver/lib/libtritonserver.so https://publish.djl.ai/tritonserver/r24.04/libtritonserver.so && \
    curl -fL -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so https://publish.djl.ai/tensorrt-llm/v0.12.0/libtriton_tensorrtllm.so && \
    curl -fL -o /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm_common.so https://publish.djl.ai/tensorrt-llm/v0.12.0/libtriton_tensorrtllm_common.so && \
    curl -fL -o /opt/tritonserver/lib/libnvinfer_plugin_tensorrt_llm.so.10 https://publish.djl.ai/tensorrt-llm/v0.12.0/libnvinfer_plugin_tensorrt_llm.so.10 && \
    apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements-trt.txt ./requirements.txt
RUN pip3 install -r requirements.txt && pip3 cache purge
# TRT depends on transformers<=4.42.4, but we need a higher version for llama 3.1
RUN pip3 install transformers==4.44.2 --no-deps

# Final steps
RUN scripts/install_djl_serving.sh $djl_version $djl_serving_version && \
scripts/install_s5cmd.sh x64 && \
Expand All @@ -108,13 +83,8 @@ RUN scripts/install_djl_serving.sh $djl_version $djl_serving_version && \
useradd -m -d /home/djl djl && \
chown -R djl:djl /opt/djl && \
rm -rf scripts && \
pip3 install numpy==${numpy_version} && \
pip3 cache purge && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Add CUDA-Compat
RUN apt-get update && apt-get install -y cuda-compat-12-4 && apt-get clean -y && rm -rf /var/lib/apt/lists/*

LABEL maintainer="djl-dev@amazon.com"
LABEL dlc_major_version="1"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.djl.tensorrtllm="true"
Expand All @@ -123,7 +93,7 @@ LABEL com.amazonaws.sagemaker.capabilities.multi-models="true"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
LABEL djl-version=$djl_version
LABEL djl-serving-version=$djl_serving_version
LABEL trtllm-version=$trtllm_version
LABEL trtllm-version=v0.12.0
LABEL cuda-version=$cuda_version
# To use the 535 CUDA driver
LABEL com.amazonaws.sagemaker.inference.cuda.verified_versions=12.2
LABEL com.amazonaws.sagemaker.inference.cuda.verified_versions=12.5

0 comments on commit 7dc354a

Please sign in to comment.