From d602ff3af45eb1227cbfb67a0d806c9a49439443 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 21 Nov 2024 11:08:04 -0800 Subject: [PATCH 01/20] Test docker hub ubuntu24.04 --- .github/container/Dockerfile.base | 12 +++++++++--- .github/container/Dockerfile.equinox | 4 ++-- .github/container/Dockerfile.levanter | 2 +- .github/container/Dockerfile.maxtext.amd64 | 4 ++-- .github/container/Dockerfile.maxtext.arm64 | 8 ++++---- .github/container/Dockerfile.mjx | 4 ++-- .github/container/Dockerfile.pax.amd64 | 4 ++-- .github/container/Dockerfile.pax.arm64 | 10 +++++----- .github/container/Dockerfile.t5x.amd64 | 4 ++-- .github/container/Dockerfile.t5x.arm64 | 6 +++--- 10 files changed, 32 insertions(+), 26 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 50fda91a2..d0bdab868 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu22.04 +ARG BASE_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu24.04 ARG GIT_USER_NAME="JAX Toolbox" ARG GIT_USER_EMAIL=jax@nvidia.com ARG CLANG_VERSION=18 @@ -53,12 +53,14 @@ apt_packages=( liblzma-dev python-is-python3 python3-pip + python3-venv rsync vim wget jq # llvm.sh - lsb-release software-properties-common + lsb-release + software-properties-common # GCP autoconfig pciutils hwloc bind9-host ) @@ -127,7 +129,11 @@ git apply Date: Thu, 21 Nov 2024 22:31:06 -0800 Subject: [PATCH 02/20] Adobt build for ubuntu-24.04 --- .github/container/Dockerfile.jax | 2 +- .github/container/Dockerfile.levanter | 2 +- ...rfile.maxtext.arm64 => Dockerfile.maxtext} | 32 ++++++--- .github/container/Dockerfile.maxtext.amd64 | 34 ---------- .../{Dockerfile.pax.arm64 => Dockerfile.pax} | 68 ++++++++++++------- .github/container/Dockerfile.pax.amd64 | 53 --------------- .../{Dockerfile.t5x.arm64 => Dockerfile.t5x} | 29 ++++---- .github/container/Dockerfile.t5x.amd64 | 43 ------------ 
.github/workflows/_build.yaml | 2 +- .github/workflows/_ci.yaml | 6 +- README.md | 6 +- 11 files changed, 90 insertions(+), 187 deletions(-) rename .github/container/{Dockerfile.maxtext.arm64 => Dockerfile.maxtext} (71%) delete mode 100644 .github/container/Dockerfile.maxtext.amd64 rename .github/container/{Dockerfile.pax.arm64 => Dockerfile.pax} (78%) delete mode 100644 .github/container/Dockerfile.pax.amd64 rename .github/container/{Dockerfile.t5x.arm64 => Dockerfile.t5x} (81%) delete mode 100644 .github/container/Dockerfile.t5x.amd64 diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 726656a7a..95695dfd1 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -36,7 +36,7 @@ RUN --mount=type=ssh \ --mount=type=secret,id=SSH_KNOWN_HOSTS,target=/root/.ssh/known_hosts \ <<"EOF" bash -ex git-clone.sh ${URLREF_JAX} ${SRC_PATH_JAX} - sed 's/^numpy.*/numpy<2.0.0/' ${SRC_PATH_JAX}/build/requirements.in + sed -i 's/^numpy.*/numpy<2.0.0/' ${SRC_PATH_JAX}/build/requirements.in git-clone.sh ${URLREF_XLA} ${SRC_PATH_XLA} EOF diff --git a/.github/container/Dockerfile.levanter b/.github/container/Dockerfile.levanter index 2a68893b2..413c81f4c 100644 --- a/.github/container/Dockerfile.levanter +++ b/.github/container/Dockerfile.levanter @@ -34,6 +34,6 @@ COPY levanter-cache-warn.sh /opt/nvidia/entrypoint.d/ ## Install accumulated packages from the base image and the previous stage ############################################################################### -FROM mealkit as final +FROM mealkit AS final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.maxtext.arm64 b/.github/container/Dockerfile.maxtext similarity index 71% rename from .github/container/Dockerfile.maxtext.arm64 rename to .github/container/Dockerfile.maxtext index 0411a4bf3..be1324e6d 100644 --- a/.github/container/Dockerfile.maxtext.arm64 +++ b/.github/container/Dockerfile.maxtext @@ -2,7 +2,7 @@ ARG 
BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG SRC_PATH_MAXTEXT=/opt/maxtext ARG SRC_PATH_TFTEXT=/opt/tensorflow-text @@ -11,24 +11,25 @@ ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ############################################################################### ARG BASE_IMAGE -FROM ${BASE_IMAGE} AS wheel-builder +FROM ${BASE_IMAGE} as wheel-builder #------------------------------------------------------------------------------ # build tensorflow-text from source #------------------------------------------------------------------------------ -FROM wheel-builder AS tftext-builder +FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT + +RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 +RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} # The tftext build script queries GitHub, but these requests are sometimes # throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. 
# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh +sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh # Newer versions of LLVM make lld's --undefined-version check of lld is strict # by default (https://reviews.llvm.org/D135402), but the tftext build seems to @@ -38,14 +39,13 @@ echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scr ./oss_scripts/run_build.sh EOF - ############################################################################### ## Download source and add auxiliary scripts ############################################################################### -FROM ${BASE_IMAGE} AS mealkit +FROM ${BASE_IMAGE} as mealkit ARG URLREF_MAXTEXT -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.18.0 ARG SRC_PATH_MAXTEXT ARG SRC_PATH_TFTEXT=/opt/tensorflow-text @@ -56,6 +56,16 @@ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip- RUN <<"EOF" bash -ex git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in +for pattern in \ + "s|@git+https://github.com/mlperf/logging.git||g" \ + "s|absl-py|absl-py==2.1.0|g" \ + "s|protobuf==3.20.3|protobuf>=3.19.0|g" \ + "s|tensorflow-datasets|tensorflow-datasets>=4.8.0|g" \ + "s|@git+https://github.com/google/pathways-utils.git||g" \ + ; do + sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt; +done +echo "tensorflow-metadata>=1.15.0" >> ${SRC_PATH_MAXTEXT}/requirements.txt EOF ############################################################################### @@ -68,8 +78,8 @@ ADD test-maxtext.sh /usr/local/bin ## Install accumulated packages from the base image 
and the previous stage ############################################################################### -FROM mealkit AS final +FROM mealkit as final RUN pip-finalize.sh -WORKDIR ${SRC_PATH_MAXTEXT} +WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file diff --git a/.github/container/Dockerfile.maxtext.amd64 b/.github/container/Dockerfile.maxtext.amd64 deleted file mode 100644 index aab5089d6..000000000 --- a/.github/container/Dockerfile.maxtext.amd64 +++ /dev/null @@ -1,34 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main -ARG SRC_PATH_MAXTEXT=/opt/maxtext - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} AS mealkit -ARG URLREF_MAXTEXT -ARG SRC_PATH_MAXTEXT - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} -echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in -EOF - -############################################################################### -## Add test script to the path -############################################################################### - -ADD test-maxtext.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit AS final - -RUN pip-finalize.sh - -WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax similarity index 78% rename from .github/container/Dockerfile.pax.arm64 rename to .github/container/Dockerfile.pax index d862420e6..532498ab4 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ 
b/.github/container/Dockerfile.pax @@ -3,7 +3,7 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_PAXML=https://github.com/google/paxml.git#main ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_LINGVO=https://github.com/tensorflow/lingvo.git#master ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis @@ -15,30 +15,30 @@ ARG SRC_PATH_LINGVO=/opt/lingvo ############################################################################### ARG BASE_IMAGE -FROM ${BASE_IMAGE} AS wheel-builder +FROM ${BASE_IMAGE} as wheel-builder #------------------------------------------------------------------------------ # build tensorflow-text from source #------------------------------------------------------------------------------ -FROM wheel-builder AS tftext-builder +FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} - + # The tftext build script queries GitHub, but these requests are sometimes # throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. # A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh - +sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh + # Newer versions of LLVM make lld's --undefined-version check of lld is strict # by default (https://reviews.llvm.org/D135402), but the tftext build seems to # rely on this behavior. 
echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - + ./oss_scripts/run_build.sh EOF @@ -46,24 +46,25 @@ EOF # build lingvo #------------------------------------------------------------------------------ -FROM wheel-builder AS lingvo-builder +FROM wheel-builder as lingvo-builder ARG URLREF_LINGVO ARG SRC_PATH_TFTEXT ARG SRC_PATH_LINGVO - + # Preserve the version of tensorflow-text COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ - -RUN <<"EOF" bash -exu -o pipefail -git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO} -EOF - + ENV USE_BAZEL_VERSION=7.1.2 + # build lingvo RUN <<"EOF" bash -exu -o pipefail +git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO} pushd ${SRC_PATH_LINGVO} +CPU_ARCH="$(dpkg --print-architecture)" +if [[ "${CPU_ARCH}" == "arm64" ]]; then + # Use aarch distribution of protobufs patch -p1 <<"EOFINNER" diff --git a/lingvo/repo.bzl b/lingvo/repo.bzl @@ -84,13 +85,32 @@ index ce65822d2..d9c0277aa 100644 def icu(): EOFINNER -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 /opt/tensorflow_text*.whl -sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt -sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt -sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt +fi + +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 /opt/tensorflow_text*.whl +for pattern in \ + "s|tensorflow=|#tensorflow=|g" \ + "s|tensorflow-text=|#tensorflow-text=|g" \ + "s|dataclasses=|#dataclasses=|g" \ + "s|==.*||g" \ +; do + sed -i "${pattern}" ${SRC_PATH_LINGVO}/docker/dev.requirements.txt +done +for pattern in \ + "s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.0|g" \ + "s|tensorflow~=2.13.0|tensorflow~=2.18.0|g" \ + "s|python_requires='>=3.8,<3.11'|python_requires='>=3.8,<3.13'|" \ +; do + sed -i "${pattern}" 
${SRC_PATH_LINGVO}/pip_package/setup.py; +done pip install -r docker/dev.requirements.txt # Some tests are flaky right now, so we skip running the tests. +BUILD_ARCH="x86_64" +if [[ "$CPU_ARCH" == "arm64" ]]; then + BUILD_ARCH="aarch64"; +fi +sed -i 's/manylinux2014_x86_64/manylinux_2_38_'"${BUILD_ARCH}"'/' pip_package/build.sh SKIP_TESTS=1 PYTHON_MINOR_VERSION=$(python --version | cut -d ' ' -f 2 | cut -d '.' -f 2) pip_package/build.sh EOF @@ -99,7 +119,7 @@ EOF ############################################################################### ARG BASE_IMAGE -FROM ${BASE_IMAGE} AS mealkit +FROM ${BASE_IMAGE} as mealkit ARG URLREF_PAXML ARG URLREF_PRAXIS ARG SRC_PATH_PAXML @@ -108,7 +128,7 @@ ARG SRC_PATH_TFTEXT # Preserve version information of tensorflow-text and lingvo COPY --from=lingvo-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/ +COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*-linux*.whl /opt/ RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ @@ -116,7 +136,6 @@ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip- # paxml + praxis RUN <<"EOF" bash -ex -echo "tensorflow==2.13.0" >> /opt/pip-tools.d/requirements-paxml.in echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in echo "auditwheel" >> /opt/pip-tools.d/requirements-paxml.in @@ -136,6 +155,8 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do "s|^scikit-learn|#scikit-learn|" \ "s|^protobuf|#protobuf|" \ "s|^numpy|#numpy|" \ + "s|^orbax-checkpoint|#orbax-checkpoint|" \ + "s| @ git+https://github.com/google/CommonLoopUtils||g" \ ; do sed -i "${pattern}" */pip_package/requirements.txt requirements.in done @@ -148,6 +169,7 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do fi popd done +sed -i 's/pysimdjson==[0-9.]*/pysimdjson/' 
${SRC_PATH_PAXML}/setup.py EOF ADD test-pax.sh /usr/local/bin @@ -156,6 +178,6 @@ ADD test-pax.sh /usr/local/bin ## Install accumulated packages from the base image and the previous stage ############################################################################### -FROM mealkit AS final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 deleted file mode 100644 index 9a13fdb2c..000000000 --- a/.github/container/Dockerfile.pax.amd64 +++ /dev/null @@ -1,53 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_PAXML=https://github.com/google/paxml.git#main -ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG SRC_PATH_PAXML=/opt/paxml -ARG SRC_PATH_PRAXIS=/opt/praxis - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} AS mealkit -ARG URLREF_PAXML -ARG URLREF_PRAXIS -ARG SRC_PATH_PAXML -ARG SRC_PATH_PRAXIS - -# update TE manifest file to install the [test] extras -RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/requirements-te.in - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_PAXML} ${SRC_PATH_PAXML} -git-clone.sh ${URLREF_PRAXIS} ${SRC_PATH_PRAXIS} -echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in -echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in - -for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do - pushd ${src} - sed -i "s| @ git+https://github.com/google/flax||g" requirements.in - sed -i "s| @ git+https://github.com/google/jax||g" requirements.in - ## we pin etils because newer etils versions are not compatible with the - ## version of TFDS required by Pax - sed -i "s/etils/etils==1.7.0/g" requirements.in - if git diff --quiet; then - echo "URL 
specs no longer present in select dependencies for ${src}" - exit 1 - else - git commit -a -m "remove URL specs from select dependencies for ${src}" - fi - popd -done -EOF - -ADD test-pax.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit AS final - -RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x.arm64 b/.github/container/Dockerfile.t5x similarity index 81% rename from .github/container/Dockerfile.t5x.arm64 rename to .github/container/Dockerfile.t5x index adabe248c..283a8544a 100644 --- a/.github/container/Dockerfile.t5x.arm64 +++ b/.github/container/Dockerfile.t5x @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1-labs # Example command to build manually: -# docker buildx build -f Dockerfile.t5x.arm64 --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . +# docker buildx build -f Dockerfile.t5x --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . 
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 @@ -15,20 +15,20 @@ ARG SRC_PATH_T5X=/opt/t5x #------------------------------------------------------------------------------ # build tensorflow-text from source #------------------------------------------------------------------------------ - -FROM ${BASE_IMAGE} AS tftext-builder +FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT + +RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 +RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} - + # The tftext build script queries GitHub, but these requests are sometimes # throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. # A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh - +sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh + # Newer versions of LLVM make lld's --undefined-version check of lld is strict # by default (https://reviews.llvm.org/D135402), but the tftext build seems to # rely on this behavior. 
@@ -67,14 +67,15 @@ pushd ${SRC_PATH_T5X} sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py # for ARM64 build -sed -i "s/'tensorflow/#'tensorflow/" setup.py - -sed -i "s/f'jax/#f'jax/" setup.py -sed -i "s/'tpu/#'tpu/" setup.py +if [[ "$(dpkg --print-architecture)" == "arm64" ]]; then + sed -i "s/'tensorflow/#'tensorflow/" setup.py -sed -i "s/'protobuf/#'protobuf/" setup.py -sed -i "s/'numpy/#'numpy/" setup.py + sed -i "s/f'jax/#f'jax/" setup.py + sed -i "s/'tpu/#'tpu/" setup.py + sed -i "s/'protobuf/#'protobuf/" setup.py + sed -i "s/'numpy/#'numpy/" setup.py +fi if git diff --quiet; then echo "URL specs no longer present in select dependencies of t5x" exit 1 diff --git a/.github/container/Dockerfile.t5x.amd64 b/.github/container/Dockerfile.t5x.amd64 deleted file mode 100644 index 92597c60a..000000000 --- a/.github/container/Dockerfile.t5x.amd64 +++ /dev/null @@ -1,43 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_T5X=https://github.com/google-research/t5x.git#main -ARG SRC_PATH_T5X=/opt/t5x - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} AS mealkit -ARG URLREF_T5X -ARG SRC_PATH_T5X - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_T5X} ${SRC_PATH_T5X} -echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/requirements-t5x.in -# This is required because pip can sometimes try to pull every version of seqio-nightly during -# resolution which leads to a ResolutionTooDeep error. 
The latest nightlies appear to work -# so setting the lower-bound to something recent -echo "seqio-nightly>=0.0.18.dev20240714" >> /opt/pip-tools.d/requirements-t5x.in - -# remove head-of-tree specs from select dependencies -pushd ${SRC_PATH_T5X} -sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py -if git diff --quiet; then - echo "URL specs no longer present in select dependencies of t5x" - exit 1 -else - git commit -a -m "remove URL specs from select dependencies of t5x" -fi -popd -EOF - -ADD test-t5x.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit AS final - -RUN pip-finalize.sh diff --git a/.github/workflows/_build.yaml b/.github/workflows/_build.yaml index 77d1f6469..83cd2a772 100644 --- a/.github/workflows/_build.yaml +++ b/.github/workflows/_build.yaml @@ -31,7 +31,7 @@ on: required: true DOCKERFILE: type: string - description: "Dockerfile to use, e.g. .github/container/Dockerfile.t5x.amd64" + description: "Dockerfile to use, e.g. 
.github/container/Dockerfile.t5x" required: true DOCKER_CONTEXT: type: string diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 2ebe18d0e..7632c5f8e 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -107,7 +107,7 @@ jobs: BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext.${{ inputs.ARCHITECTURE }} + DOCKERFILE: .github/container/Dockerfile.maxtext EXTRA_BUILD_ARGS: | URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} secrets: inherit @@ -138,7 +138,7 @@ jobs: BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x.${{ inputs.ARCHITECTURE }} + DOCKERFILE: .github/container/Dockerfile.t5x EXTRA_BUILD_ARGS: | URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} secrets: inherit @@ -153,7 +153,7 @@ jobs: BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: upstream-pax - DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} + DOCKERFILE: .github/container/Dockerfile.pax EXTRA_BUILD_ARGS: | URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} diff --git a/README.md b/README.md index 538589e5f..78835517b 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ We support and test the following JAX frameworks and model architectures. More d - + @@ -227,7 +227,7 @@ We support and test the following JAX frameworks and model architectures. More d - + @@ -275,7 +275,7 @@ We support and test the following JAX frameworks and model architectures. 
More d - + From 3f4efa5f3038d39fe80b00d880293c1ec9f0ff53 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 22 Nov 2024 12:29:21 -0800 Subject: [PATCH 03/20] Fix build for pax, t5x, gemma --- .github/container/Dockerfile.jax | 1 - .github/container/Dockerfile.pax | 1 + .github/container/Dockerfile.t5x | 5 ++++- rosetta/Dockerfile.gemma | 37 +++++++++++++++++++++++++++++++- 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 95695dfd1..9423d6c18 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -36,7 +36,6 @@ RUN --mount=type=ssh \ --mount=type=secret,id=SSH_KNOWN_HOSTS,target=/root/.ssh/known_hosts \ <<"EOF" bash -ex git-clone.sh ${URLREF_JAX} ${SRC_PATH_JAX} - sed -i 's/^numpy.*/numpy<2.0.0/' ${SRC_PATH_JAX}/build/requirements.in git-clone.sh ${URLREF_XLA} ${SRC_PATH_XLA} EOF diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 532498ab4..41c750062 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -150,6 +150,7 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do for pattern in \ "s| @ git+https://github.com/google/flax||g" \ "s| @ git+https://github.com/google/jax||g" \ + "s| @ git+https://github.com/google/fiddle||g" \ "s|^tensorflow|#tensorflow|" \ "s|^lingvo|#lingvo|" \ "s|^scikit-learn|#scikit-learn|" \ diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 283a8544a..eb9112bde 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -12,6 +12,9 @@ ARG SRC_PATH_T5X=/opt/t5x ## build several packages which do not have working arm64 pip wheels ############################################################################### +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder + #------------------------------------------------------------------------------ # build tensorflow-text from source 
#------------------------------------------------------------------------------ @@ -39,7 +42,7 @@ EOF ############################################################################### -## T5X for AArch64 +## T5X ############################################################################### FROM ${BASE_IMAGE} AS mealkit diff --git a/rosetta/Dockerfile.gemma b/rosetta/Dockerfile.gemma index 1efc4e719..d9f4e4d27 100644 --- a/rosetta/Dockerfile.gemma +++ b/rosetta/Dockerfile.gemma @@ -12,6 +12,38 @@ ARG SRC_PATH_FLAXFORMER=/opt/flaxformer ARG URLREF_PANOPTICAPI=https://github.com/akolesnikoff/panopticapi.git#mute ARG SRC_PATH_PANOPTICAPI=/opt/panopticapi +############################################################################### +## Build several packages which do not have working amd64/arm64 pip wheels +############################################################################### + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder + +#------------------------------------------------------------------------------ +# build tensorflow-text from source +#------------------------------------------------------------------------------ +FROM wheel-builder as tftext-builder +ARG URLREF_TFTEXT +ARG SRC_PATH_TFTEXT + +RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 +RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} +RUN <<"EOF" bash -exu -o pipefail +cd ${SRC_PATH_TFTEXT} + +# The tftext build script queries GitHub, but these requests are sometimes +# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. +# A workaround (needs to be updated when the tensorflow version changes): +sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh + +# Newer versions of LLVM make lld's --undefined-version check of lld is strict +# by default (https://reviews.llvm.org/D135402), but the tftext build seems to +# rely on this behavior. 
+echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh + +./oss_scripts/run_build.sh +EOF + ############################################################################### ## Download source and add auxiliary scripts ############################################################################### @@ -29,6 +61,9 @@ ARG URLREF_PANOPTICAPI ARG SRC_PATH_PANOPTICAPI +COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml +COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ + RUN <<"EOF" bash -ex git-clone.sh ${URLREF_GEMMA} ${SRC_PATH_GEMMA} git-clone.sh ${URLREF_BIG_VISION} ${SRC_PATH_BIG_VISION} @@ -53,7 +88,7 @@ optax protobuf tfds-nightly tensorflow -tensorflow-text +tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl) tensorflow-gan " >> /opt/pip-tools.d/requirements-gemma.in EOF From b2eab6528f156fb59c0e29437dd61f9fbec2e67e Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 22 Nov 2024 13:25:00 -0800 Subject: [PATCH 04/20] Use master branch of TF-text --- .github/container/Dockerfile.maxtext | 2 +- .github/container/Dockerfile.t5x | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index be1324e6d..4369c2662 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -45,7 +45,7 @@ EOF FROM ${BASE_IMAGE} as mealkit ARG URLREF_MAXTEXT -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.18.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG SRC_PATH_MAXTEXT ARG SRC_PATH_TFTEXT=/opt/tensorflow-text diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index eb9112bde..ced296439 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -3,7 +3,7 @@ # docker buildx build -f Dockerfile.t5x --tag t5x --build-arg 
BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_T5X=https://github.com/google-research/t5x.git#main ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ARG SRC_PATH_T5X=/opt/t5x From 71ad68bd5c5c5dca1e0e56fe57b3470bdd444d67 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 22 Nov 2024 15:53:45 -0800 Subject: [PATCH 05/20] Fix gemma TF-text urls --- rosetta/Dockerfile.gemma | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rosetta/Dockerfile.gemma b/rosetta/Dockerfile.gemma index d9f4e4d27..39d4c9548 100644 --- a/rosetta/Dockerfile.gemma +++ b/rosetta/Dockerfile.gemma @@ -11,6 +11,8 @@ ARG URLREF_FLAXFORMER=https://github.com/google/flaxformer.git#main ARG SRC_PATH_FLAXFORMER=/opt/flaxformer ARG URLREF_PANOPTICAPI=https://github.com/akolesnikoff/panopticapi.git#mute ARG SRC_PATH_PANOPTICAPI=/opt/panopticapi +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master +ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ############################################################################### ## Build several packages which do not have working amd64/arm64 pip wheels From 0b452c4239ec26c075a42033ebb80c816cbf2a70 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Mon, 25 Nov 2024 14:00:38 -0800 Subject: [PATCH 06/20] Fix T5x build --- .github/container/Dockerfile.base | 2 +- .github/container/Dockerfile.t5x | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index d0bdab868..4e8901524 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -130,7 +130,7 @@ git add -u git commit -m 'Adds JAX_TOOLBOX_VCS_EQUIVALENCY as a trigger to treat all github VCS installs for a package as equivalent. 
The spec of the last encountered version will be used' EOF # Create a system-wide venv for the pip-installed world inside the containers -RUN python -m venv --prompt jax /opt/venv && /opt/venv/bin/pip install --ignore-installed --no-cache-dir -e /opt/pip pip-tools +RUN python -m venv --prompt jax /opt/venv && /opt/venv/bin/pip install --no-cache-dir -e /opt/pip pip-tools # Make sure `python` refers to the venv version ENV PATH=/opt/venv/bin:${PATH} diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index ced296439..dec39fce8 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -85,6 +85,8 @@ if git diff --quiet; then else git commit -a -m "remove URL specs from select dependencies of t5x" fi +sed -i 's/pysimdjson==[0-9.]*/pysimdjson/' setup.py +sed -i 's/fasttext==[0-9.]*/fasttext/' setup.py popd EOF From 62e7ed771ec750e2f12a81391b9775eac705a675 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Tue, 26 Nov 2024 13:20:25 -0800 Subject: [PATCH 07/20] Address comments --- .github/container/Dockerfile.equinox | 4 ++-- .github/container/Dockerfile.levanter | 4 ++-- .github/container/Dockerfile.maxtext | 1 + .github/container/Dockerfile.mjx | 4 ++-- .github/container/Dockerfile.pax | 14 +++++++++----- .github/container/manifest.yaml | 4 ++-- .github/workflows/_ci.yaml | 17 +++++++++++------ 7 files changed, 29 insertions(+), 19 deletions(-) diff --git a/.github/container/Dockerfile.equinox b/.github/container/Dockerfile.equinox index 804cbeb16..264b11d20 100644 --- a/.github/container/Dockerfile.equinox +++ b/.github/container/Dockerfile.equinox @@ -8,7 +8,7 @@ ARG SRC_PATH_EQUINOX=/opt/equinox ## Download source and add auxiliary scripts ############################################################################### -FROM ${BASE_IMAGE} AS mealkit +FROM ${BASE_IMAGE} as mealkit ARG URLREF_EQUINOX ARG SRC_PATH_EQUINOX @@ -22,6 +22,6 @@ EOF ## Install accumulated packages from the base image and the 
previous stage ############################################################################### -FROM mealkit AS final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.levanter b/.github/container/Dockerfile.levanter index 413c81f4c..90b31ec04 100644 --- a/.github/container/Dockerfile.levanter +++ b/.github/container/Dockerfile.levanter @@ -10,7 +10,7 @@ ARG SRC_PATH_HALIAX=/opt/haliax ## Download source and add auxiliary scripts ############################################################################### -FROM ${BASE_IMAGE} AS mealkit +FROM ${BASE_IMAGE} as mealkit ARG URLREF_LEVANTER ARG URLREF_HALIAX ARG SRC_PATH_LEVANTER @@ -34,6 +34,6 @@ COPY levanter-cache-warn.sh /opt/nvidia/entrypoint.d/ ## Install accumulated packages from the base image and the previous stage ############################################################################### -FROM mealkit AS final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index 4369c2662..a38c46240 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -17,6 +17,7 @@ FROM ${BASE_IMAGE} as wheel-builder # build tensorflow-text from source #------------------------------------------------------------------------------ +# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64 FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT diff --git a/.github/container/Dockerfile.mjx b/.github/container/Dockerfile.mjx index 510e90a77..1bdd1a439 100644 --- a/.github/container/Dockerfile.mjx +++ b/.github/container/Dockerfile.mjx @@ -12,7 +12,7 @@ ARG SRC_PATH_L2R=/opt/language-to-reward-2023 ## Download source and add auxiliary scripts ############################################################################### -FROM ${BASE_IMAGE} AS mealkit +FROM ${BASE_IMAGE} as mealkit ARG URLREF_MUJOCO ARG URLREF_MUJOCO_MPC ARG URLREF_L2R @@ 
-49,6 +49,6 @@ EOF ## Install accumulated packages from the base image and the previous stage ############################################################################### -FROM mealkit AS final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 41c750062..68afc42ea 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -21,6 +21,7 @@ FROM ${BASE_IMAGE} as wheel-builder # build tensorflow-text from source #------------------------------------------------------------------------------ +# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64 FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT @@ -28,17 +29,17 @@ RUN <<"EOF" bash -exu -o pipefail pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} - + # The tftext build script queries GitHub, but these requests are sometimes # throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. # A workaround (needs to be updated when the tensorflow version changes): sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh - + # Newer versions of LLVM make lld's --undefined-version check of lld is strict # by default (https://reviews.llvm.org/D135402), but the tftext build seems to # rely on this behavior. 
echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - + ./oss_scripts/run_build.sh EOF @@ -46,15 +47,16 @@ EOF # build lingvo #------------------------------------------------------------------------------ +# Remove Lingvo build from source when it has py-3.12 wheels for x86/arm64 FROM wheel-builder as lingvo-builder ARG URLREF_LINGVO ARG SRC_PATH_TFTEXT ARG SRC_PATH_LINGVO - + # Preserve the version of tensorflow-text COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ - + ENV USE_BAZEL_VERSION=7.1.2 # build lingvo @@ -96,6 +98,8 @@ for pattern in \ ; do sed -i "${pattern}" ${SRC_PATH_LINGVO}/docker/dev.requirements.txt done +# Lingvo support only python < 3.12, so we hack it and update dependencies +# to be able to build for py-3.12 for pattern in \ "s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.0|g" \ "s|tensorflow~=2.13.0|tensorflow~=2.18.0|g" \ diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index 437ca93c6..5ccf5fe75 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -62,8 +62,8 @@ lingvo: tensorflow-text: # Used only in ARM pax and t5x builds url: https://github.com/tensorflow/text.git - tracking_ref: v2.13.0 - latest_verified_commit: 917a681d7220ebf9b62a08b6f9ce7b7db886ddef + tracking_ref: master + latest_verified_commit: 1779b3ae16f7bd287c4edcf66d62208dc63256f3 mode: git-clone pydantic: version: X.Y.Z diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 7632c5f8e..6cef657db 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -79,7 +79,8 @@ jobs: CONTAINER_NAME: triton DOCKERFILE: .github/container/Dockerfile.triton RUNNER_SIZE: large - EXTRA_BUILD_ARGS: URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ 
fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} secrets: inherit build-equinox: @@ -110,6 +111,7 @@ jobs: DOCKERFILE: .github/container/Dockerfile.maxtext EXTRA_BUILD_ARGS: | URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} secrets: inherit build-levanter: @@ -141,6 +143,7 @@ jobs: DOCKERFILE: .github/container/Dockerfile.t5x EXTRA_BUILD_ARGS: | URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} secrets: inherit build-upstream-pax: @@ -157,6 +160,8 @@ jobs: EXTRA_BUILD_ARGS: | URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} + URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} secrets: inherit build-rosetta-t5x: @@ -193,11 +198,11 @@ jobs: DOCKERFILE: rosetta/Dockerfile.gemma DOCKER_CONTEXT: . 
EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).gemma }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).big_vision }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).common_loop_utils }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).flaxformer }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).panopticapi }} + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} secrets: inherit collect-docker-tags: From beb4f8292f7d923c126d6548ff9a33409a6d0e6c Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Tue, 26 Nov 2024 16:04:06 -0800 Subject: [PATCH 08/20] Fix gemma build --- rosetta/Dockerfile.gemma | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/rosetta/Dockerfile.gemma b/rosetta/Dockerfile.gemma index 39d4c9548..e7db16dcc 100644 --- a/rosetta/Dockerfile.gemma +++ b/rosetta/Dockerfile.gemma @@ -28,9 +28,9 @@ FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT -RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} RUN <<"EOF" bash -exu -o pipefail +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 +git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} # The tftext build script queries GitHub, but these requests are sometimes @@ -50,6 +50,7 @@ EOF ## Download source and add auxiliary scripts ############################################################################### +ARG BASE_IMAGE FROM ${BASE_IMAGE} as mealkit ARG URLREF_GEMMA ARG SRC_PATH_GEMMA @@ -61,7 +62,8 @@ ARG URLREF_FLAXFORMER ARG SRC_PATH_FLAXFORMER ARG URLREF_PANOPTICAPI ARG 
SRC_PATH_PANOPTICAPI - +ARG URLREF_TFTEXT +ARG SRC_PATH_TFTEXT COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ @@ -72,11 +74,12 @@ git-clone.sh ${URLREF_BIG_VISION} ${SRC_PATH_BIG_VISION} git-clone.sh ${URLREF_COMMON_LOOP_UTILS} ${SRC_PATH_COMMON_LOOP_UTILS} git-clone.sh ${URLREF_FLAXFORMER} ${SRC_PATH_FLAXFORMER} git-clone.sh ${URLREF_PANOPTICAPI} ${SRC_PATH_PANOPTICAPI} -echo "-e file://${SRC_PATH_GEMMA}" >> /opt/pip-tools.d/requirements-gemma.in -echo "-e file://${SRC_PATH_COMMON_LOOP_UTILS}" >> /opt/pip-tools.d/requirements-gemma.in -echo "-e file://${SRC_PATH_FLAXFORMER}" >> /opt/pip-tools.d/requirements-gemma.in -echo "-e file://${SRC_PATH_PANOPTICAPI}" >> /opt/pip-tools.d/requirements-gemma.in -echo "ipython==8.2 +echo " +-e file://${SRC_PATH_GEMMA} +-e file://${SRC_PATH_COMMON_LOOP_UTILS} +-e file://${SRC_PATH_FLAXFORMER} +-e file://${SRC_PATH_PANOPTICAPI} +ipython==8.2 jupyterlab gcloud overrides @@ -95,7 +98,7 @@ tensorflow-gan " >> /opt/pip-tools.d/requirements-gemma.in EOF -ENV PYTHONPATH "${SRC_PATH_BIG_VISION}:${PYTHONPATH}" +ENV PYTHONPATH="${SRC_PATH_BIG_VISION}:${PYTHONPATH}" ADD ./rosetta/rosetta/projects/paligemma/Finetune_PaliGemma.ipynb ${SRC_PATH_GEMMA}/examples/Finetune_PaliGemma.ipynb ADD ./rosetta/rosetta/projects/paligemma/test_gemma.py ${SRC_PATH_GEMMA}/tests/test_gemma.py From 3c2ec9706fb09f8ebf7e4c43bd0f00aa6626d5f9 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 27 Nov 2024 00:02:09 -0800 Subject: [PATCH 09/20] Clone airio --- .github/container/Dockerfile.t5x | 17 +++++++++++++++-- .github/container/manifest.yaml | 4 ++-- .github/workflows/_ci.yaml | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index dec39fce8..1d6494f09 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -5,8 +5,11 @@ ARG 
BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_T5X=https://github.com/google-research/t5x.git#main +ARG URLREF_AIRIO=https://github.com/google/airio.git#main ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ARG SRC_PATH_T5X=/opt/t5x +ARG SRC_PATH_AIRIO=/opt/airio + ############################################################################### ## build several packages which do not have working arm64 pip wheels @@ -23,8 +26,8 @@ ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 -RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} RUN <<"EOF" bash -exu -o pipefail +git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} # The tftext build script queries GitHub, but these requests are sometimes @@ -45,14 +48,16 @@ EOF ## T5X ############################################################################### +ARG BASE_IMAGE FROM ${BASE_IMAGE} AS mealkit ARG URLREF_T5X +ARG URLREF_AIRIO ARG SRC_PATH_TFTEXT ARG SRC_PATH_T5X +ARG SRC_PATH_AIRIO # Preserve version information of tensorflow-text COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml - COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-t5x.in @@ -78,6 +83,14 @@ if [[ "$(dpkg --print-architecture)" == "arm64" ]]; then sed -i "s/'protobuf/#'protobuf/" setup.py sed -i "s/'numpy/#'numpy/" setup.py + + + # airio pins grain==0.2.0, but the later does not have arm64 wheel. 
+ # Need to bump grain to 0.2.2 to resolve the issue (https://github.com/google/airio/issues/257) + git-clone.sh ${URLREF_AIRIO} ${SRC_PATH_AIRIO} + sed -i "s/grain==0.2.0/grain/g" ${SRC_PATH_AIRIO}/setup.py + sed -i "s/'airio/#'airio/g" setup.py + echo "-e file://${SRC_PATH_AIRIO}" >> /opt/pip-tools.d/requirements-t5x.in fi if git diff --quiet; then echo "URL specs no longer present in select dependencies of t5x" diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index 5ccf5fe75..b9c06e2e6 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -78,8 +78,8 @@ fiddle: airio: url: https://github.com/google/airio.git tracking_ref: main - latest_verified_commit: cfca4a10de1491d76d2d00fcbd7142079837ca99 - mode: pip-vcs + latest_verified_commit: 37109ff0d1059f885b9b11ef9058eca5d219d7cb + mode: git-clone clu: url: https://github.com/google/CommonLoopUtils.git tracking_ref: main diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 6cef657db..2f77c07f4 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -144,6 +144,7 @@ jobs: EXTRA_BUILD_ARGS: | URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} secrets: inherit build-upstream-pax: From 173ddc5a2b90f1e77ee0960f871751f2e7e402d3 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 27 Nov 2024 13:58:49 -0800 Subject: [PATCH 10/20] Update maxtext docker --- .github/container/Dockerfile.maxtext | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index a38c46240..17bc6f998 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -57,12 +57,13 @@ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip- RUN <<"EOF" bash -ex git-clone.sh 
${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in + +# specify some restrictions to speed up the build and +# avoid pip to download and check all available versions of packages for pattern in \ - "s|@git+https://github.com/mlperf/logging.git||g" \ - "s|absl-py|absl-py==2.1.0|g" \ + "s|absl-py|absl-py>=2.1.0|g" \ "s|protobuf==3.20.3|protobuf>=3.19.0|g" \ "s|tensorflow-datasets|tensorflow-datasets>=4.8.0|g" \ - "s|@git+https://github.com/google/pathways-utils.git||g" \ ; do sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt; done From 92996e3cb3909a2adae725dd98e63ff794e897f2 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Mon, 2 Dec 2024 12:20:44 -0700 Subject: [PATCH 11/20] Uninstall several packages and add PIP_BREAK_SYSTEM_PACKAGES=1 env var --- .github/container/Dockerfile.base | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 4e8901524..0653e7dd7 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -53,7 +53,6 @@ apt_packages=( liblzma-dev python-is-python3 python3-pip - python3-venv rsync vim wget @@ -72,6 +71,18 @@ if [[ $(dpkg --print-architecture) == arm64 ]]; then fi apt-get install -y ${apt_packages[@]} +# There are several python packages (in the list below) that are installed with OS +# package manager (the run of `apt-get install` above) and can not be uninstall +# using pip (in pip-finalize.sh script) during JAX installation. Remove then in +# advance to avoid JAX installation issue. 
+uninstall_packages=( + python3-gobject + python3-yaml + python3-markdown + python3-pygments +) +apt-get remove -y ${uninstall_packages[@]} + # Install LLVM/Clang bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" -- ${CLANG_VERSION} apt-get remove -y software-properties-common lsb-release @@ -129,10 +140,10 @@ git apply Date: Mon, 2 Dec 2024 16:20:37 -0700 Subject: [PATCH 12/20] Uninstall several packages and add PIP_BREAK_SYSTEM_PACKAGES=1 env var --- .github/container/Dockerfile.base | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 0653e7dd7..ce68b45d5 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -76,10 +76,10 @@ apt-get install -y ${apt_packages[@]} # using pip (in pip-finalize.sh script) during JAX installation. Remove then in # advance to avoid JAX installation issue. uninstall_packages=( - python3-gobject - python3-yaml - python3-markdown - python3-pygments + python-gobject + python-yaml + python-markdown + python-pygments ) apt-get remove -y ${uninstall_packages[@]} From 8c10287c635e5ec888b72f52e0ea64be28bbb6ac Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Mon, 2 Dec 2024 16:52:08 -0700 Subject: [PATCH 13/20] Edit remove packages list --- .github/container/Dockerfile.base | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index ce68b45d5..abc2c8828 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -75,13 +75,10 @@ apt-get install -y ${apt_packages[@]} # package manager (the run of `apt-get install` above) and can not be uninstall # using pip (in pip-finalize.sh script) during JAX installation. Remove then in # advance to avoid JAX installation issue. 
-uninstall_packages=( - python-gobject - python-yaml - python-markdown - python-pygments +remove_packages=( + python3-gi ) -apt-get remove -y ${uninstall_packages[@]} +apt-get remove -y ${remove_packages[@]} # Install LLVM/Clang bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" -- ${CLANG_VERSION} From c75c825a2c20aefdcfcc518c557376c4facf0542 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Mon, 2 Dec 2024 17:02:56 -0700 Subject: [PATCH 14/20] Edit remove packages list --- .github/container/Dockerfile.base | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index abc2c8828..4ce359a61 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -71,19 +71,8 @@ if [[ $(dpkg --print-architecture) == arm64 ]]; then fi apt-get install -y ${apt_packages[@]} -# There are several python packages (in the list below) that are installed with OS -# package manager (the run of `apt-get install` above) and can not be uninstall -# using pip (in pip-finalize.sh script) during JAX installation. Remove then in -# advance to avoid JAX installation issue. -remove_packages=( - python3-gi -) -apt-get remove -y ${remove_packages[@]} - # Install LLVM/Clang bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" -- ${CLANG_VERSION} -apt-get remove -y software-properties-common lsb-release -apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX # Make sure that clang and clang++ point to the new version. This list is based # on the symlinks installed by the `clang` (as opposed to `clang-14`) and `lld` @@ -114,6 +103,18 @@ EOL apt-get clean rm -rf /var/lib/apt/lists/* + +# There are several python packages (in the list below) that are installed with OS +# package manager (the run of `apt-get install` above) and can not be uninstall +# using pip (in pip-finalize.sh script) during JAX installation. 
Remove then in +# advance to avoid JAX installation issue. +remove_packages=( + python3-gi + software-properties-common + lsb-release +) +apt-get remove -y ${remove_packages[@]} +apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX EOF RUN <<"EOF" bash -ex From 8468c9f9b0828a1e43bf660745261816a80bfd93 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Tue, 3 Dec 2024 00:11:32 -0700 Subject: [PATCH 15/20] Edit remove packages list --- .github/container/Dockerfile.base | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 4ce359a61..7e5861e1c 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -112,7 +112,10 @@ remove_packages=( python3-gi software-properties-common lsb-release + python3-yaml + python3-pygments ) + apt-get remove -y ${remove_packages[@]} apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX EOF From 008b3fc4f4b3c5846ca9d23a3f636d0f92f542a3 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Tue, 3 Dec 2024 10:09:49 -0700 Subject: [PATCH 16/20] [skip ci] Resurrect amd64/arm64 dockerfiles --- .github/container/Dockerfile.maxtext.amd64 | 34 +++++ .github/container/Dockerfile.maxtext.arm64 | 75 ++++++++++ .github/container/Dockerfile.pax.amd64 | 53 +++++++ .github/container/Dockerfile.pax.arm64 | 161 +++++++++++++++++++++ .github/container/Dockerfile.t5x.amd64 | 43 ++++++ .github/container/Dockerfile.t5x.arm64 | 95 ++++++++++++ 6 files changed, 461 insertions(+) create mode 100644 .github/container/Dockerfile.maxtext.amd64 create mode 100644 .github/container/Dockerfile.maxtext.arm64 create mode 100644 .github/container/Dockerfile.pax.amd64 create mode 100644 .github/container/Dockerfile.pax.arm64 create mode 100644 .github/container/Dockerfile.t5x.amd64 create mode 100644 .github/container/Dockerfile.t5x.arm64 diff --git a/.github/container/Dockerfile.maxtext.amd64
b/.github/container/Dockerfile.maxtext.amd64 new file mode 100644 index 000000000..8289a6099 --- /dev/null +++ b/.github/container/Dockerfile.maxtext.amd64 @@ -0,0 +1,34 @@ +# syntax=docker/dockerfile:1-labs + +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main +ARG SRC_PATH_MAXTEXT=/opt/maxtext + +############################################################################### +## Download source and add auxiliary scripts +############################################################################### + +FROM ${BASE_IMAGE} as mealkit +ARG URLREF_MAXTEXT +ARG SRC_PATH_MAXTEXT + +RUN <<"EOF" bash -ex +git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} +echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in +EOF + +############################################################################### +## Add test script to the path +############################################################################### + +ADD test-maxtext.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh + +WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file diff --git a/.github/container/Dockerfile.maxtext.arm64 b/.github/container/Dockerfile.maxtext.arm64 new file mode 100644 index 000000000..90713afb6 --- /dev/null +++ b/.github/container/Dockerfile.maxtext.arm64 @@ -0,0 +1,75 @@ +# syntax=docker/dockerfile:1-labs + +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG SRC_PATH_MAXTEXT=/opt/maxtext +ARG SRC_PATH_TFTEXT=/opt/tensorflow-text + +############################################################################### +## 
build tensorflow-text and lingvo, which do not have working arm64 pip wheels +############################################################################### + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder + +#------------------------------------------------------------------------------ +# build tensorflow-text from source +#------------------------------------------------------------------------------ + +FROM wheel-builder as tftext-builder +ARG URLREF_TFTEXT +ARG SRC_PATH_TFTEXT +RUN <<"EOF" bash -exu -o pipefail +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} +cd ${SRC_PATH_TFTEXT} + +# The tftext build script queries GitHub, but these requests are sometimes +# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. +# A workaround (needs to be updated when the tensorflow version changes): +sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh + +# Newer versions of LLVM make lld's --undefined-version check of lld is strict +# by default (https://reviews.llvm.org/D135402), but the tftext build seems to +# rely on this behavior. 
+echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh + +./oss_scripts/run_build.sh +EOF + + +############################################################################### +## Download source and add auxiliary scripts +############################################################################### + +FROM ${BASE_IMAGE} as mealkit +ARG URLREF_MAXTEXT +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG SRC_PATH_MAXTEXT +ARG SRC_PATH_TFTEXT=/opt/tensorflow-text + +# Preserve version information of tensorflow-text +COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ +RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-maxtext.in + +RUN <<"EOF" bash -ex +git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} +echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in +EOF + +############################################################################### +## Add test script to the path +############################################################################### + +ADD test-maxtext.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh + +WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 new file mode 100644 index 000000000..2076b8f1d --- /dev/null +++ b/.github/container/Dockerfile.pax.amd64 @@ -0,0 +1,53 @@ +# syntax=docker/dockerfile:1-labs + +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_PAXML=https://github.com/google/paxml.git#main +ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main +ARG SRC_PATH_PAXML=/opt/paxml +ARG 
SRC_PATH_PRAXIS=/opt/praxis + +############################################################################### +## Download source and add auxiliary scripts +############################################################################### + +FROM ${BASE_IMAGE} as mealkit +ARG URLREF_PAXML +ARG URLREF_PRAXIS +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS + +# update TE manifest file to install the [test] extras +RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/requirements-te.in + +RUN <<"EOF" bash -ex +git-clone.sh ${URLREF_PAXML} ${SRC_PATH_PAXML} +git-clone.sh ${URLREF_PRAXIS} ${SRC_PATH_PRAXIS} +echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in +echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in + +for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do + pushd ${src} + sed -i "s| @ git+https://github.com/google/flax||g" requirements.in + sed -i "s| @ git+https://github.com/google/jax||g" requirements.in + ## we pin etils because newer etils versions are not compatible with the + ## version of TFDS required by Pax + sed -i "s/etils/etils==1.7.0/g" requirements.in + if git diff --quiet; then + echo "URL specs no longer present in select dependencies for ${src}" + exit 1 + else + git commit -a -m "remove URL specs from select dependencies for ${src}" + fi + popd +done +EOF + +ADD test-pax.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh \ No newline at end of file diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 new file mode 100644 index 000000000..b39344c2c --- /dev/null +++ b/.github/container/Dockerfile.pax.arm64 @@ -0,0 +1,161 @@ +# syntax=docker/dockerfile:1-labs + +ARG 
BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_PAXML=https://github.com/google/paxml.git#main +ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_LINGVO=https://github.com/tensorflow/lingvo.git#master +ARG SRC_PATH_PAXML=/opt/paxml +ARG SRC_PATH_PRAXIS=/opt/praxis +ARG SRC_PATH_TFTEXT=/opt/tensorflow-text +ARG SRC_PATH_LINGVO=/opt/lingvo + +############################################################################### +## build tensorflow-text and lingvo, which do not have working arm64 pip wheels +############################################################################### + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder + +#------------------------------------------------------------------------------ +# build tensorflow-text from source +#------------------------------------------------------------------------------ + +FROM wheel-builder as tftext-builder +ARG URLREF_TFTEXT +ARG SRC_PATH_TFTEXT +RUN <<"EOF" bash -exu -o pipefail +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} +cd ${SRC_PATH_TFTEXT} + +# The tftext build script queries GitHub, but these requests are sometimes +# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. +# A workaround (needs to be updated when the tensorflow version changes): +sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh + +# Newer versions of LLVM make lld's --undefined-version check of lld is strict +# by default (https://reviews.llvm.org/D135402), but the tftext build seems to +# rely on this behavior. 
+echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh + +./oss_scripts/run_build.sh +EOF + +#------------------------------------------------------------------------------ +# build lingvo +#------------------------------------------------------------------------------ + +FROM wheel-builder as lingvo-builder +ARG URLREF_LINGVO +ARG SRC_PATH_TFTEXT +ARG SRC_PATH_LINGVO + +# Preserve the version of tensorflow-text +COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml +COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ + +RUN <<"EOF" bash -exu -o pipefail +git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO} +EOF + +ENV USE_BAZEL_VERSION=7.1.2 +# build lingvo +RUN <<"EOF" bash -exu -o pipefail +pushd ${SRC_PATH_LINGVO} + +# Use aarch distribution of protobufs +patch -p1 <<"EOFINNER" +diff --git a/lingvo/repo.bzl b/lingvo/repo.bzl +index ce65822d2..d9c0277aa 100644 +--- a/lingvo/repo.bzl ++++ b/lingvo/repo.bzl +@@ -232,9 +232,9 @@ filegroup( + ) + """, + urls = [ +- "https://github.com/protocolbuffers/protobuf/releases/download/v21.9/protoc-21.9-linux-x86_64.zip", ++ "https://github.com/protocolbuffers/protobuf/releases/download/v21.9/protoc-21.9-linux-aarch_64.zip", + ], +- sha256 = "3cd951aff8ce713b94cde55e12378f505f2b89d47bf080508cf77e3934f680b6", ++ sha256 = "a584286dfa8ebb17032ece206ed74d5e9931e2edb9016e427be2a0dab3b21071", + ) + + def icu(): +EOFINNER + +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 /opt/tensorflow_text*.whl +sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt +sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt +sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt +pip install -r docker/dev.requirements.txt + +# Some tests are flaky right now, so we skip running the tests. +SKIP_TESTS=1 PYTHON_MINOR_VERSION=$(python --version | cut -d ' ' -f 2 | cut -d '.' 
-f 2) pip_package/build.sh +EOF + +############################################################################### +## Pax for AArch64 +############################################################################### + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as mealkit +ARG URLREF_PAXML +ARG URLREF_PRAXIS +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS +ARG SRC_PATH_TFTEXT + +# Preserve version information of tensorflow-text and lingvo +COPY --from=lingvo-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml +COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/ +RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in + +COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ +RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-paxml.in + +# paxml + praxis +RUN <<"EOF" bash -ex +echo "tensorflow==2.13.0" >> /opt/pip-tools.d/requirements-paxml.in +echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in +echo "auditwheel" >> /opt/pip-tools.d/requirements-paxml.in + +git-clone.sh ${URLREF_PAXML} ${SRC_PATH_PAXML} +git-clone.sh ${URLREF_PRAXIS} ${SRC_PATH_PRAXIS} +echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in +echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in + +for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do + pushd ${src} + + for pattern in \ + "s| @ git+https://github.com/google/flax||g" \ + "s| @ git+https://github.com/google/jax||g" \ + "s|^tensorflow|#tensorflow|" \ + "s|^lingvo|#lingvo|" \ + "s|^scikit-learn|#scikit-learn|" \ + "s|^protobuf|#protobuf|" \ + "s|^numpy|#numpy|" \ + ; do + sed -i "${pattern}" */pip_package/requirements.txt requirements.in + done + + if git diff --quiet; then + echo "broken dependencies no longer present in ${src}" + exit 1 + else + git commit -a -m "remove broken dependencies from ${src}" + fi + popd +done +EOF + +ADD 
test-pax.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh \ No newline at end of file diff --git a/.github/container/Dockerfile.t5x.amd64 b/.github/container/Dockerfile.t5x.amd64 new file mode 100644 index 000000000..760f641b2 --- /dev/null +++ b/.github/container/Dockerfile.t5x.amd64 @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1-labs + +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_T5X=https://github.com/google-research/t5x.git#main +ARG SRC_PATH_T5X=/opt/t5x + +############################################################################### +## Download source and add auxiliary scripts +############################################################################### + +FROM ${BASE_IMAGE} as mealkit +ARG URLREF_T5X +ARG SRC_PATH_T5X + +RUN <<"EOF" bash -ex +git-clone.sh ${URLREF_T5X} ${SRC_PATH_T5X} +echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/requirements-t5x.in +# This is required because pip can sometimes try to pull every version of seqio-nightly during +# resolution which leads to a ResolutionTooDeep error. 
The latest nightlies appear to work +# so setting the lower-bound to something recent +echo "seqio-nightly>=0.0.18.dev20240714" >> /opt/pip-tools.d/requirements-t5x.in + +# remove head-of-tree specs from select dependencies +pushd ${SRC_PATH_T5X} +sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py +if git diff --quiet; then + echo "URL specs no longer present in select dependencies of t5x" + exit 1 +else + git commit -a -m "remove URL specs from select dependencies of t5x" +fi +popd +EOF + +ADD test-t5x.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh \ No newline at end of file diff --git a/.github/container/Dockerfile.t5x.arm64 b/.github/container/Dockerfile.t5x.arm64 new file mode 100644 index 000000000..35ed69412 --- /dev/null +++ b/.github/container/Dockerfile.t5x.arm64 @@ -0,0 +1,95 @@ +# syntax=docker/dockerfile:1-labs +# Example command to build manually: +# docker buildx build -f Dockerfile.t5x.arm64 --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . 
+ +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_T5X=https://github.com/google-research/t5x.git#main +ARG SRC_PATH_TFTEXT=/opt/tensorflow-text +ARG SRC_PATH_T5X=/opt/t5x + +############################################################################### +## build several packages which do not have working arm64 pip wheels +############################################################################### + +#------------------------------------------------------------------------------ +# build tensorflow-text from source +#------------------------------------------------------------------------------ + +FROM ${BASE_IMAGE} as tftext-builder +ARG URLREF_TFTEXT +ARG SRC_PATH_TFTEXT +RUN <<"EOF" bash -exu -o pipefail +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} +cd ${SRC_PATH_TFTEXT} + +# The tftext build script queries GitHub, but these requests are sometimes +# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. +# A workaround (needs to be updated when the tensorflow version changes): +sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh + +# Newer versions of LLVM make lld's --undefined-version check of lld is strict +# by default (https://reviews.llvm.org/D135402), but the tftext build seems to +# rely on this behavior. 
+echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh + +./oss_scripts/run_build.sh +EOF + + +############################################################################### +## T5X for AArch64 +############################################################################### + +FROM ${BASE_IMAGE} as mealkit +ARG URLREF_T5X +ARG SRC_PATH_TFTEXT +ARG SRC_PATH_T5X + +# Preserve version information of tensorflow-text +COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml + +COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ +RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-t5x.in + +RUN <<"EOF" bash -ex +# 1. Fetch T5X +git-clone.sh "${URLREF_T5X}" "${SRC_PATH_T5X}" +echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/requirements-t5x.in +# This is required because pip can sometimes try to pull every version of seqio-nightly during +# resolution which leads to a ResolutionTooDeep error. The latest nightlies appear to work +# so setting the lower-bound to something recent +echo "seqio-nightly>=0.0.18.dev20240714" >> /opt/pip-tools.d/requirements-t5x.in + +# 2. 
Remove head-of-tree specs from select dependencies +pushd ${SRC_PATH_T5X} +sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py + +# for ARM64 build +sed -i "s/'tensorflow/#'tensorflow/" setup.py + +sed -i "s/f'jax/#f'jax/" setup.py +sed -i "s/'tpu/#'tpu/" setup.py + +sed -i "s/'protobuf/#'protobuf/" setup.py +sed -i "s/'numpy/#'numpy/" setup.py + +if git diff --quiet; then + echo "URL specs no longer present in select dependencies of t5x" + exit 1 +else + git commit -a -m "remove URL specs from select dependencies of t5x" +fi +popd +EOF + +ADD test-t5x.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous[] stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh \ No newline at end of file From d633578f7ad51a3753921e3a29233581e5b7a706 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Tue, 3 Dec 2024 10:12:05 -0700 Subject: [PATCH 17/20] [skip ci] Resurect amd64/arm64 dockerfiles: fix whitespace error --- .github/container/Dockerfile.maxtext | 2 +- .github/container/Dockerfile.maxtext.amd64 | 2 +- .github/container/Dockerfile.maxtext.arm64 | 2 +- .github/container/Dockerfile.t5x.amd64 | 2 +- .github/container/Dockerfile.t5x.arm64 | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index 17bc6f998..44164a60f 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -84,4 +84,4 @@ FROM mealkit as final RUN pip-finalize.sh -WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file +WORKDIR ${SRC_PATH_MAXTEXT} diff --git a/.github/container/Dockerfile.maxtext.amd64 b/.github/container/Dockerfile.maxtext.amd64 index 8289a6099..9d08cc360 100644 --- a/.github/container/Dockerfile.maxtext.amd64 +++ 
b/.github/container/Dockerfile.maxtext.amd64 @@ -31,4 +31,4 @@ FROM mealkit as final RUN pip-finalize.sh -WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file +WORKDIR ${SRC_PATH_MAXTEXT} diff --git a/.github/container/Dockerfile.maxtext.arm64 b/.github/container/Dockerfile.maxtext.arm64 index 90713afb6..bd64c5b6d 100644 --- a/.github/container/Dockerfile.maxtext.arm64 +++ b/.github/container/Dockerfile.maxtext.arm64 @@ -72,4 +72,4 @@ FROM mealkit as final RUN pip-finalize.sh -WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file +WORKDIR ${SRC_PATH_MAXTEXT} diff --git a/.github/container/Dockerfile.t5x.amd64 b/.github/container/Dockerfile.t5x.amd64 index 760f641b2..dc180060b 100644 --- a/.github/container/Dockerfile.t5x.amd64 +++ b/.github/container/Dockerfile.t5x.amd64 @@ -40,4 +40,4 @@ ADD test-t5x.sh /usr/local/bin FROM mealkit as final -RUN pip-finalize.sh \ No newline at end of file +RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x.arm64 b/.github/container/Dockerfile.t5x.arm64 index 35ed69412..e9d267f01 100644 --- a/.github/container/Dockerfile.t5x.arm64 +++ b/.github/container/Dockerfile.t5x.arm64 @@ -92,4 +92,4 @@ ADD test-t5x.sh /usr/local/bin FROM mealkit as final -RUN pip-finalize.sh \ No newline at end of file +RUN pip-finalize.sh From 81b50cc412bb635cf82395fd1b65715ab95dc8fa Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Tue, 3 Dec 2024 10:12:35 -0700 Subject: [PATCH 18/20] [skip ci] Resurect amd64/arm64 dockerfiles: fix whitespace error --- .github/container/Dockerfile.pax.amd64 | 2 +- .github/container/Dockerfile.pax.arm64 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index 2076b8f1d..52a7723ab 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -50,4 +50,4 @@ ADD test-pax.sh /usr/local/bin FROM mealkit as final -RUN pip-finalize.sh \ No newline at end of file +RUN 
pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index b39344c2c..15cac24b7 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -158,4 +158,4 @@ ADD test-pax.sh /usr/local/bin FROM mealkit as final -RUN pip-finalize.sh \ No newline at end of file +RUN pip-finalize.sh From 96c16a92a662ede08ca43d86c4f760c9946070f4 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Tue, 3 Dec 2024 11:59:27 -0700 Subject: [PATCH 19/20] Add comment for pip install pip-23.3.1 --- .github/container/Dockerfile.base | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 7e5861e1c..9e41e955e 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -144,9 +144,12 @@ EOF # install all python packages system-wide. ENV PIP_BREAK_SYSTEM_PACKAGES=1 +# An extra flag `--ignore-installed` is added below, because of the following reason: +# after upgrading to ver 23.3.1 (from /opt/pip) `pip` tries to uninstall itself (default pip-24.0) +# and fails due to pip-24.0 has been installed with system tool `apt` but not `python`. So we keep +# both pip-24.0 and pip-23.3.1 in the system, but use 23.3.1 with equivalency patch (see above). 
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/* - ############################################################################### ## Install TCPx ############################################################################### From 2c1ee0deb97143d64a09643f55147a67a409ab61 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 4 Dec 2024 06:08:51 +0000 Subject: [PATCH 20/20] remove arch-specific Dockerfiles and add pointer to utopian versions --- .github/container/Dockerfile.maxtext | 3 + .github/container/Dockerfile.maxtext.amd64 | 34 ----- .github/container/Dockerfile.maxtext.arm64 | 75 ---------- .github/container/Dockerfile.pax | 3 + .github/container/Dockerfile.pax.amd64 | 53 ------- .github/container/Dockerfile.pax.arm64 | 161 --------------------- .github/container/Dockerfile.t5x | 3 + .github/container/Dockerfile.t5x.amd64 | 43 ------ .github/container/Dockerfile.t5x.arm64 | 95 ------------ 9 files changed, 9 insertions(+), 461 deletions(-) delete mode 100644 .github/container/Dockerfile.maxtext.amd64 delete mode 100644 .github/container/Dockerfile.maxtext.arm64 delete mode 100644 .github/container/Dockerfile.pax.amd64 delete mode 100644 .github/container/Dockerfile.pax.arm64 delete mode 100644 .github/container/Dockerfile.t5x.amd64 delete mode 100644 .github/container/Dockerfile.t5x.arm64 diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index 44164a60f..87b73efcd 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -85,3 +85,6 @@ FROM mealkit as final RUN pip-finalize.sh WORKDIR ${SRC_PATH_MAXTEXT} + +# When tftext and lingvo wheels are published on pypi.org, revert this +# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.maxtext.amd64 b/.github/container/Dockerfile.maxtext.amd64 deleted file mode 100644 index 9d08cc360..000000000 --- 
a/.github/container/Dockerfile.maxtext.amd64 +++ /dev/null @@ -1,34 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main -ARG SRC_PATH_MAXTEXT=/opt/maxtext - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_MAXTEXT -ARG SRC_PATH_MAXTEXT - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} -echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in -EOF - -############################################################################### -## Add test script to the path -############################################################################### - -ADD test-maxtext.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh - -WORKDIR ${SRC_PATH_MAXTEXT} diff --git a/.github/container/Dockerfile.maxtext.arm64 b/.github/container/Dockerfile.maxtext.arm64 deleted file mode 100644 index bd64c5b6d..000000000 --- a/.github/container/Dockerfile.maxtext.arm64 +++ /dev/null @@ -1,75 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 -ARG SRC_PATH_MAXTEXT=/opt/maxtext -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text - -############################################################################### -## build tensorflow-text and lingvo, which do not have working arm64 pip wheels 
-############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ - -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT -RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. 
-echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF - - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_MAXTEXT -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 -ARG SRC_PATH_MAXTEXT -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text - -# Preserve version information of tensorflow-text -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-maxtext.in - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} -echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in -EOF - -############################################################################### -## Add test script to the path -############################################################################### - -ADD test-maxtext.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh - -WORKDIR ${SRC_PATH_MAXTEXT} diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 68afc42ea..938bd853c 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -186,3 +186,6 @@ ADD test-pax.sh /usr/local/bin FROM mealkit as final RUN pip-finalize.sh + +# When tftext and lingvo wheels are published on pypi.org, revert this +# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.pax.amd64 
b/.github/container/Dockerfile.pax.amd64 deleted file mode 100644 index 52a7723ab..000000000 --- a/.github/container/Dockerfile.pax.amd64 +++ /dev/null @@ -1,53 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_PAXML=https://github.com/google/paxml.git#main -ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG SRC_PATH_PAXML=/opt/paxml -ARG SRC_PATH_PRAXIS=/opt/praxis - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_PAXML -ARG URLREF_PRAXIS -ARG SRC_PATH_PAXML -ARG SRC_PATH_PRAXIS - -# update TE manifest file to install the [test] extras -RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/requirements-te.in - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_PAXML} ${SRC_PATH_PAXML} -git-clone.sh ${URLREF_PRAXIS} ${SRC_PATH_PRAXIS} -echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in -echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in - -for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do - pushd ${src} - sed -i "s| @ git+https://github.com/google/flax||g" requirements.in - sed -i "s| @ git+https://github.com/google/jax||g" requirements.in - ## we pin etils because newer etils versions are not compatible with the - ## version of TFDS required by Pax - sed -i "s/etils/etils==1.7.0/g" requirements.in - if git diff --quiet; then - echo "URL specs no longer present in select dependencies for ${src}" - exit 1 - else - git commit -a -m "remove URL specs from select dependencies for ${src}" - fi - popd -done -EOF - -ADD test-pax.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage 
-############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 deleted file mode 100644 index 15cac24b7..000000000 --- a/.github/container/Dockerfile.pax.arm64 +++ /dev/null @@ -1,161 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_PAXML=https://github.com/google/paxml.git#main -ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 -ARG URLREF_LINGVO=https://github.com/tensorflow/lingvo.git#master -ARG SRC_PATH_PAXML=/opt/paxml -ARG SRC_PATH_PRAXIS=/opt/praxis -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text -ARG SRC_PATH_LINGVO=/opt/lingvo - -############################################################################### -## build tensorflow-text and lingvo, which do not have working arm64 pip wheels -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as wheel-builder - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ - -FROM wheel-builder as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT -RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. 
-# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. -echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF - -#------------------------------------------------------------------------------ -# build lingvo -#------------------------------------------------------------------------------ - -FROM wheel-builder as lingvo-builder -ARG URLREF_LINGVO -ARG SRC_PATH_TFTEXT -ARG SRC_PATH_LINGVO - -# Preserve the version of tensorflow-text -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ - -RUN <<"EOF" bash -exu -o pipefail -git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO} -EOF - -ENV USE_BAZEL_VERSION=7.1.2 -# build lingvo -RUN <<"EOF" bash -exu -o pipefail -pushd ${SRC_PATH_LINGVO} - -# Use aarch distribution of protobufs -patch -p1 <<"EOFINNER" -diff --git a/lingvo/repo.bzl b/lingvo/repo.bzl -index ce65822d2..d9c0277aa 100644 ---- a/lingvo/repo.bzl -+++ b/lingvo/repo.bzl -@@ -232,9 +232,9 @@ filegroup( - ) - """, - urls = [ -- "https://github.com/protocolbuffers/protobuf/releases/download/v21.9/protoc-21.9-linux-x86_64.zip", -+ "https://github.com/protocolbuffers/protobuf/releases/download/v21.9/protoc-21.9-linux-aarch_64.zip", - ], -- sha256 = "3cd951aff8ce713b94cde55e12378f505f2b89d47bf080508cf77e3934f680b6", -+ sha256 = "a584286dfa8ebb17032ece206ed74d5e9931e2edb9016e427be2a0dab3b21071", - ) - - def icu(): -EOFINNER - -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 /opt/tensorflow_text*.whl -sed -i 
's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt -sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt -sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt -pip install -r docker/dev.requirements.txt - -# Some tests are flaky right now, so we skip running the tests. -SKIP_TESTS=1 PYTHON_MINOR_VERSION=$(python --version | cut -d ' ' -f 2 | cut -d '.' -f 2) pip_package/build.sh -EOF - -############################################################################### -## Pax for AArch64 -############################################################################### - -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_PAXML -ARG URLREF_PRAXIS -ARG SRC_PATH_PAXML -ARG SRC_PATH_PRAXIS -ARG SRC_PATH_TFTEXT - -# Preserve version information of tensorflow-text and lingvo -COPY --from=lingvo-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/ -RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in - -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-paxml.in - -# paxml + praxis -RUN <<"EOF" bash -ex -echo "tensorflow==2.13.0" >> /opt/pip-tools.d/requirements-paxml.in -echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in -echo "auditwheel" >> /opt/pip-tools.d/requirements-paxml.in - -git-clone.sh ${URLREF_PAXML} ${SRC_PATH_PAXML} -git-clone.sh ${URLREF_PRAXIS} ${SRC_PATH_PRAXIS} -echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in -echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in - -for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do - pushd ${src} - - for pattern in \ - "s| @ git+https://github.com/google/flax||g" \ - "s| @ git+https://github.com/google/jax||g" \ - 
"s|^tensorflow|#tensorflow|" \ - "s|^lingvo|#lingvo|" \ - "s|^scikit-learn|#scikit-learn|" \ - "s|^protobuf|#protobuf|" \ - "s|^numpy|#numpy|" \ - ; do - sed -i "${pattern}" */pip_package/requirements.txt requirements.in - done - - if git diff --quiet; then - echo "broken dependencies no longer present in ${src}" - exit 1 - else - git commit -a -m "remove broken dependencies from ${src}" - fi - popd -done -EOF - -ADD test-pax.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 1d6494f09..ea4bbf2ec 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -112,3 +112,6 @@ ADD test-t5x.sh /usr/local/bin FROM mealkit AS final RUN pip-finalize.sh + +# When tftext and lingvo wheels are published on pypi.org, revert this +# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.t5x.amd64 b/.github/container/Dockerfile.t5x.amd64 deleted file mode 100644 index dc180060b..000000000 --- a/.github/container/Dockerfile.t5x.amd64 +++ /dev/null @@ -1,43 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_T5X=https://github.com/google-research/t5x.git#main -ARG SRC_PATH_T5X=/opt/t5x - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_T5X -ARG SRC_PATH_T5X - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_T5X} ${SRC_PATH_T5X} -echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/requirements-t5x.in -# This is required 
because pip can sometimes try to pull every version of seqio-nightly during -# resolution which leads to a ResolutionTooDeep error. The latest nightlies appear to work -# so setting the lower-bound to something recent -echo "seqio-nightly>=0.0.18.dev20240714" >> /opt/pip-tools.d/requirements-t5x.in - -# remove head-of-tree specs from select dependencies -pushd ${SRC_PATH_T5X} -sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py -if git diff --quiet; then - echo "URL specs no longer present in select dependencies of t5x" - exit 1 -else - git commit -a -m "remove URL specs from select dependencies of t5x" -fi -popd -EOF - -ADD test-t5x.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x.arm64 b/.github/container/Dockerfile.t5x.arm64 deleted file mode 100644 index e9d267f01..000000000 --- a/.github/container/Dockerfile.t5x.arm64 +++ /dev/null @@ -1,95 +0,0 @@ -# syntax=docker/dockerfile:1-labs -# Example command to build manually: -# docker buildx build -f Dockerfile.t5x.arm64 --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . 
- -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 -ARG URLREF_T5X=https://github.com/google-research/t5x.git#main -ARG SRC_PATH_TFTEXT=/opt/tensorflow-text -ARG SRC_PATH_T5X=/opt/t5x - -############################################################################### -## build several packages which do not have working arm64 pip wheels -############################################################################### - -#------------------------------------------------------------------------------ -# build tensorflow-text from source -#------------------------------------------------------------------------------ - -FROM ${BASE_IMAGE} as tftext-builder -ARG URLREF_TFTEXT -ARG SRC_PATH_TFTEXT -RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 -git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} -cd ${SRC_PATH_TFTEXT} - -# The tftext build script queries GitHub, but these requests are sometimes -# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. -# A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh - -# Newer versions of LLVM make lld's --undefined-version check of lld is strict -# by default (https://reviews.llvm.org/D135402), but the tftext build seems to -# rely on this behavior. 
-echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh - -./oss_scripts/run_build.sh -EOF - - -############################################################################### -## T5X for AArch64 -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_T5X -ARG SRC_PATH_TFTEXT -ARG SRC_PATH_T5X - -# Preserve version information of tensorflow-text -COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml - -COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-t5x.in - -RUN <<"EOF" bash -ex -# 1. Fetch T5X -git-clone.sh "${URLREF_T5X}" "${SRC_PATH_T5X}" -echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/requirements-t5x.in -# This is required because pip can sometimes try to pull every version of seqio-nightly during -# resolution which leads to a ResolutionTooDeep error. The latest nightlies appear to work -# so setting the lower-bound to something recent -echo "seqio-nightly>=0.0.18.dev20240714" >> /opt/pip-tools.d/requirements-t5x.in - -# 2. 
Remove head-of-tree specs from select dependencies -pushd ${SRC_PATH_T5X} -sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py - -# for ARM64 build -sed -i "s/'tensorflow/#'tensorflow/" setup.py - -sed -i "s/f'jax/#f'jax/" setup.py -sed -i "s/'tpu/#'tpu/" setup.py - -sed -i "s/'protobuf/#'protobuf/" setup.py -sed -i "s/'numpy/#'numpy/" setup.py - -if git diff --quiet; then - echo "URL specs no longer present in select dependencies of t5x" - exit 1 -else - git commit -a -m "remove URL specs from select dependencies of t5x" -fi -popd -EOF - -ADD test-t5x.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh