diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index e1d2abc1f..185b6ddb6 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu22.04 +ARG BASE_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu24.04 ARG GIT_USER_NAME="JAX Toolbox" ARG GIT_USER_EMAIL=jax@nvidia.com ARG CLANG_VERSION=18 @@ -60,7 +60,8 @@ apt_packages=( wget jq # llvm.sh - lsb-release software-properties-common + lsb-release + software-properties-common # GCP autoconfig pciutils hwloc bind9-host ) @@ -74,8 +75,6 @@ apt-get install -y ${apt_packages[@]} # Install LLVM/Clang bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" -- ${CLANG_VERSION} -apt-get remove -y software-properties-common lsb-release -apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX # Make sure that clang and clang++ point to the new version. This list is based # on the symlinks installed by the `clang` (as opposed to `clang-14`) and `lld` @@ -106,6 +105,21 @@ EOL apt-get clean rm -rf /var/lib/apt/lists/* + +# There are several python packages (in the list below) that are installed with OS +# package manager (the run of `apt-get install` above) and can not be uninstall +# using pip (in pip-finalize.sh script) during JAX installation. Remove then in +# advance to avoid JAX installation issue. +remove_packages=( + python3-gi + software-properties-common + lsb-release + python3-yaml + python3-pygments +) + +apt-get remove -y ${remove_packages[@]} +apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX EOF RUN <<"EOF" bash -ex @@ -129,7 +143,14 @@ git apply > oss_scr ./oss_scripts/run_build.sh EOF - ############################################################################### ## Download source and add auxiliary scripts ############################################################################### FROM ${BASE_IMAGE} as mealkit ARG URLREF_MAXTEXT -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG SRC_PATH_MAXTEXT ARG SRC_PATH_TFTEXT=/opt/tensorflow-text @@ -56,6 +57,17 @@ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip- RUN <<"EOF" bash -ex git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in + +# specify some restrictions to speed up the build and +# avoid pip to download and check all available versions of packages +for pattern in \ + "s|absl-py|absl-py>=2.1.0|g" \ + "s|protobuf==3.20.3|protobuf>=3.19.0|g" \ + "s|tensorflow-datasets|tensorflow-datasets>=4.8.0|g" \ + ; do + sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt; +done +echo "tensorflow-metadata>=1.15.0" >> ${SRC_PATH_MAXTEXT}/requirements.txt EOF ############################################################################### @@ -73,3 +85,6 @@ FROM mealkit as final RUN pip-finalize.sh WORKDIR ${SRC_PATH_MAXTEXT} + +# When tftext and lingvo wheels are published on pypi.org, revert this +# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.maxtext.amd64 b/.github/container/Dockerfile.maxtext.amd64 deleted file mode 100644 index 8289a6099..000000000 --- a/.github/container/Dockerfile.maxtext.amd64 +++ /dev/null @@ -1,34 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main -ARG SRC_PATH_MAXTEXT=/opt/maxtext - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_MAXTEXT -ARG SRC_PATH_MAXTEXT - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT} -echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in -EOF - -############################################################################### -## Add test script to the path -############################################################################### - -ADD test-maxtext.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh - -WORKDIR ${SRC_PATH_MAXTEXT} \ No newline at end of file diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax similarity index 76% rename from .github/container/Dockerfile.pax.arm64 rename to .github/container/Dockerfile.pax index 15cac24b7..938bd853c 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax @@ -3,7 +3,7 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax ARG URLREF_PAXML=https://github.com/google/paxml.git#main ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_LINGVO=https://github.com/tensorflow/lingvo.git#master ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis @@ -21,18 +21,19 @@ FROM ${BASE_IMAGE} as wheel-builder # build tensorflow-text from source #------------------------------------------------------------------------------ +# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64 FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} # The tftext build script queries GitHub, but these requests are sometimes # throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. # A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh +sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh # Newer versions of LLVM make lld's --undefined-version check of lld is strict # by default (https://reviews.llvm.org/D135402), but the tftext build seems to @@ -46,6 +47,7 @@ EOF # build lingvo #------------------------------------------------------------------------------ +# Remove Lingvo build from source when it has py-3.12 wheels for x86/arm64 FROM wheel-builder as lingvo-builder ARG URLREF_LINGVO ARG SRC_PATH_TFTEXT @@ -55,15 +57,16 @@ ARG SRC_PATH_LINGVO COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ -RUN <<"EOF" bash -exu -o pipefail -git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO} -EOF - ENV USE_BAZEL_VERSION=7.1.2 + # build lingvo RUN <<"EOF" bash -exu -o pipefail +git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO} pushd ${SRC_PATH_LINGVO} +CPU_ARCH="$(dpkg --print-architecture)" +if [[ "${CPU_ARCH}" == "arm64" ]]; then + # Use aarch distribution of protobufs patch -p1 <<"EOFINNER" diff --git a/lingvo/repo.bzl b/lingvo/repo.bzl @@ -84,13 +87,34 @@ index ce65822d2..d9c0277aa 100644 def icu(): EOFINNER -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 /opt/tensorflow_text*.whl -sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt -sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt -sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt +fi + +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 /opt/tensorflow_text*.whl +for pattern in \ + "s|tensorflow=|#tensorflow=|g" \ + "s|tensorflow-text=|#tensorflow-text=|g" \ + "s|dataclasses=|#dataclasses=|g" \ + "s|==.*||g" \ +; do + sed -i "${pattern}" ${SRC_PATH_LINGVO}/docker/dev.requirements.txt +done +# Lingvo support only python < 3.12, so we hack it and update dependencies +# to be able to build for py-3.12 +for pattern in \ + "s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.0|g" \ + "s|tensorflow~=2.13.0|tensorflow~=2.18.0|g" \ + "s|python_requires='>=3.8,<3.11'|python_requires='>=3.8,<3.13'|" \ +; do + sed -i "${pattern}" ${SRC_PATH_LINGVO}/pip_package/setup.py; +done pip install -r docker/dev.requirements.txt # Some tests are flaky right now, so we skip running the tests. +BUILD_ARCH="x86_64" +if [[ "$CPU_ARCH" == "arm64" ]]; then + BUILD_ARCH="aarch64"; +fi +sed -i 's/manylinux2014_x86_64/manylinux_2_38_'"${BUILD_ARCH}"'/' pip_package/build.sh SKIP_TESTS=1 PYTHON_MINOR_VERSION=$(python --version | cut -d ' ' -f 2 | cut -d '.' -f 2) pip_package/build.sh EOF @@ -108,7 +132,7 @@ ARG SRC_PATH_TFTEXT # Preserve version information of tensorflow-text and lingvo COPY --from=lingvo-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml -COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/ +COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*-linux*.whl /opt/ RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ @@ -116,7 +140,6 @@ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip- # paxml + praxis RUN <<"EOF" bash -ex -echo "tensorflow==2.13.0" >> /opt/pip-tools.d/requirements-paxml.in echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in echo "auditwheel" >> /opt/pip-tools.d/requirements-paxml.in @@ -131,11 +154,14 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do for pattern in \ "s| @ git+https://github.com/google/flax||g" \ "s| @ git+https://github.com/google/jax||g" \ + "s| @ git+https://github.com/google/fiddle||g" \ "s|^tensorflow|#tensorflow|" \ "s|^lingvo|#lingvo|" \ "s|^scikit-learn|#scikit-learn|" \ "s|^protobuf|#protobuf|" \ "s|^numpy|#numpy|" \ + "s|^orbax-checkpoint|#orbax-checkpoint|" \ + "s| @ git+https://github.com/google/CommonLoopUtils||g" \ ; do sed -i "${pattern}" */pip_package/requirements.txt requirements.in done @@ -148,6 +174,7 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do fi popd done +sed -i 's/pysimdjson==[0-9.]*/pysimdjson/' ${SRC_PATH_PAXML}/setup.py EOF ADD test-pax.sh /usr/local/bin @@ -159,3 +186,6 @@ ADD test-pax.sh /usr/local/bin FROM mealkit as final RUN pip-finalize.sh + +# When tftext and lingvo wheels are published on pypi.org, revert this +# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 deleted file mode 100644 index 52a7723ab..000000000 --- a/.github/container/Dockerfile.pax.amd64 +++ /dev/null @@ -1,53 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_PAXML=https://github.com/google/paxml.git#main -ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main -ARG SRC_PATH_PAXML=/opt/paxml -ARG SRC_PATH_PRAXIS=/opt/praxis - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_PAXML -ARG URLREF_PRAXIS -ARG SRC_PATH_PAXML -ARG SRC_PATH_PRAXIS - -# update TE manifest file to install the [test] extras -RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/requirements-te.in - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_PAXML} ${SRC_PATH_PAXML} -git-clone.sh ${URLREF_PRAXIS} ${SRC_PATH_PRAXIS} -echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in -echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in - -for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do - pushd ${src} - sed -i "s| @ git+https://github.com/google/flax||g" requirements.in - sed -i "s| @ git+https://github.com/google/jax||g" requirements.in - ## we pin etils because newer etils versions are not compatible with the - ## version of TFDS required by Pax - sed -i "s/etils/etils==1.7.0/g" requirements.in - if git diff --quiet; then - echo "URL specs no longer present in select dependencies for ${src}" - exit 1 - else - git commit -a -m "remove URL specs from select dependencies for ${src}" - fi - popd -done -EOF - -ADD test-pax.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x.arm64 b/.github/container/Dockerfile.t5x similarity index 65% rename from .github/container/Dockerfile.t5x.arm64 rename to .github/container/Dockerfile.t5x index e9d267f01..ea4bbf2ec 100644 --- a/.github/container/Dockerfile.t5x.arm64 +++ b/.github/container/Dockerfile.t5x @@ -1,34 +1,40 @@ # syntax=docker/dockerfile:1-labs # Example command to build manually: -# docker buildx build -f Dockerfile.t5x.arm64 --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . +# docker buildx build -f Dockerfile.t5x --tag t5x --build-arg BASE_IMAGE=ghcr.io/nvidia/jax:mealkit-2024-01-22 . ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0 +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master ARG URLREF_T5X=https://github.com/google-research/t5x.git#main +ARG URLREF_AIRIO=https://github.com/google/airio.git#main ARG SRC_PATH_TFTEXT=/opt/tensorflow-text ARG SRC_PATH_T5X=/opt/t5x +ARG SRC_PATH_AIRIO=/opt/airio + ############################################################################### ## build several packages which do not have working arm64 pip wheels ############################################################################### +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder + #------------------------------------------------------------------------------ # build tensorflow-text from source #------------------------------------------------------------------------------ - -FROM ${BASE_IMAGE} as tftext-builder +FROM wheel-builder as tftext-builder ARG URLREF_TFTEXT ARG SRC_PATH_TFTEXT + +RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 RUN <<"EOF" bash -exu -o pipefail -pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} cd ${SRC_PATH_TFTEXT} - + # The tftext build script queries GitHub, but these requests are sometimes # throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. # A workaround (needs to be updated when the tensorflow version changes): -sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh - +sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh + # Newer versions of LLVM make lld's --undefined-version check of lld is strict # by default (https://reviews.llvm.org/D135402), but the tftext build seems to # rely on this behavior. @@ -39,17 +45,19 @@ EOF ############################################################################### -## T5X for AArch64 +## T5X ############################################################################### -FROM ${BASE_IMAGE} as mealkit +ARG BASE_IMAGE +FROM ${BASE_IMAGE} AS mealkit ARG URLREF_T5X +ARG URLREF_AIRIO ARG SRC_PATH_TFTEXT ARG SRC_PATH_T5X +ARG SRC_PATH_AIRIO # Preserve version information of tensorflow-text COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml - COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-t5x.in @@ -67,20 +75,31 @@ pushd ${SRC_PATH_T5X} sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py # for ARM64 build -sed -i "s/'tensorflow/#'tensorflow/" setup.py - -sed -i "s/f'jax/#f'jax/" setup.py -sed -i "s/'tpu/#'tpu/" setup.py - -sed -i "s/'protobuf/#'protobuf/" setup.py -sed -i "s/'numpy/#'numpy/" setup.py - +if [[ "$(dpkg --print-architecture)" == "arm64" ]]; then + sed -i "s/'tensorflow/#'tensorflow/" setup.py + + sed -i "s/f'jax/#f'jax/" setup.py + sed -i "s/'tpu/#'tpu/" setup.py + + sed -i "s/'protobuf/#'protobuf/" setup.py + sed -i "s/'numpy/#'numpy/" setup.py + + + # airio pins grain==0.2.0, but the later does not have arm64 wheel. + # Need to bump grain to 0.2.2 to resolve the issue (https://github.com/google/airio/issues/257) + git-clone.sh ${URLREF_AIRIO} ${SRC_PATH_AIRIO} + sed -i "s/grain==0.2.0/grain/g" ${SRC_PATH_AIRIO}/setup.py + sed -i "s/'airio/#'airio/g" setup.py + echo "-e file://${SRC_PATH_AIRIO}" >> /opt/pip-tools.d/requirements-t5x.in +fi if git diff --quiet; then echo "URL specs no longer present in select dependencies of t5x" exit 1 else git commit -a -m "remove URL specs from select dependencies of t5x" fi +sed -i 's/pysimdjson==[0-9.]*/pysimdjson/' setup.py +sed -i 's/fasttext==[0-9.]*/fasttext/' setup.py popd EOF @@ -90,6 +109,9 @@ ADD test-t5x.sh /usr/local/bin ## Install accumulated packages from the base image and the previous[] stage ############################################################################### -FROM mealkit as final +FROM mealkit AS final RUN pip-finalize.sh + +# When tftext and lingvo wheels are published on pypi.org, revert this +# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154 diff --git a/.github/container/Dockerfile.t5x.amd64 b/.github/container/Dockerfile.t5x.amd64 deleted file mode 100644 index dc180060b..000000000 --- a/.github/container/Dockerfile.t5x.amd64 +++ /dev/null @@ -1,43 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_T5X=https://github.com/google-research/t5x.git#main -ARG SRC_PATH_T5X=/opt/t5x - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_T5X -ARG SRC_PATH_T5X - -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_T5X} ${SRC_PATH_T5X} -echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/requirements-t5x.in -# This is required because pip can sometimes try to pull every version of seqio-nightly during -# resolution which leads to a ResolutionTooDeep error. The latest nightlies appear to work -# so setting the lower-bound to something recent -echo "seqio-nightly>=0.0.18.dev20240714" >> /opt/pip-tools.d/requirements-t5x.in - -# remove head-of-tree specs from select dependencies -pushd ${SRC_PATH_T5X} -sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py -if git diff --quiet; then - echo "URL specs no longer present in select dependencies of t5x" - exit 1 -else - git commit -a -m "remove URL specs from select dependencies of t5x" -fi -popd -EOF - -ADD test-t5x.sh /usr/local/bin - -############################################################################### -## Install accumulated packages from the base image and the previous stage -############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index 437ca93c6..b9c06e2e6 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -62,8 +62,8 @@ lingvo: tensorflow-text: # Used only in ARM pax and t5x builds url: https://github.com/tensorflow/text.git - tracking_ref: v2.13.0 - latest_verified_commit: 917a681d7220ebf9b62a08b6f9ce7b7db886ddef + tracking_ref: master + latest_verified_commit: 1779b3ae16f7bd287c4edcf66d62208dc63256f3 mode: git-clone pydantic: version: X.Y.Z @@ -78,8 +78,8 @@ fiddle: airio: url: https://github.com/google/airio.git tracking_ref: main - latest_verified_commit: cfca4a10de1491d76d2d00fcbd7142079837ca99 - mode: pip-vcs + latest_verified_commit: 37109ff0d1059f885b9b11ef9058eca5d219d7cb + mode: git-clone clu: url: https://github.com/google/CommonLoopUtils.git tracking_ref: main diff --git a/.github/workflows/_build.yaml b/.github/workflows/_build.yaml index 77d1f6469..83cd2a772 100644 --- a/.github/workflows/_build.yaml +++ b/.github/workflows/_build.yaml @@ -31,7 +31,7 @@ on: required: true DOCKERFILE: type: string - description: "Dockerfile to use, e.g. .github/container/Dockerfile.t5x.amd64" + description: "Dockerfile to use, e.g. .github/container/Dockerfile.t5x" required: true DOCKER_CONTEXT: type: string diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index a0c6bcfda..2ca5d1a1e 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -79,7 +79,8 @@ jobs: CONTAINER_NAME: triton DOCKERFILE: .github/container/Dockerfile.triton RUNNER_SIZE: large - EXTRA_BUILD_ARGS: URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + EXTRA_BUILD_ARGS: | + URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} secrets: inherit build-equinox: @@ -107,9 +108,10 @@ jobs: BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext.${{ inputs.ARCHITECTURE }} + DOCKERFILE: .github/container/Dockerfile.maxtext EXTRA_BUILD_ARGS: | URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} secrets: inherit build-levanter: @@ -138,9 +140,11 @@ jobs: BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x.${{ inputs.ARCHITECTURE }} + DOCKERFILE: .github/container/Dockerfile.t5x EXTRA_BUILD_ARGS: | URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} secrets: inherit build-upstream-pax: @@ -153,10 +157,12 @@ jobs: BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: upstream-pax - DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} + DOCKERFILE: .github/container/Dockerfile.pax EXTRA_BUILD_ARGS: | URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + URLREF_TFTEXT=${{ fromJson(inputs.SOURCE_URLREFS).TENSORFLOW_TEXT }} + URLREF_LINGVO=${{ fromJson(inputs.SOURCE_URLREFS).LINGVO }} secrets: inherit build-rosetta-t5x: @@ -193,11 +199,11 @@ jobs: DOCKERFILE: rosetta/Dockerfile.gemma DOCKER_CONTEXT: . EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).gemma }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).big_vision }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).common_loop_utils }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).flaxformer }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).panopticapi }} + URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} secrets: inherit collect-docker-tags: diff --git a/README.md b/README.md index 538589e5f..78835517b 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ We support and test the following JAX frameworks and model architectures. More d - + @@ -227,7 +227,7 @@ We support and test the following JAX frameworks and model architectures. More d - + @@ -275,7 +275,7 @@ We support and test the following JAX frameworks and model architectures. More d - + diff --git a/rosetta/Dockerfile.gemma b/rosetta/Dockerfile.gemma index 1efc4e719..e7db16dcc 100644 --- a/rosetta/Dockerfile.gemma +++ b/rosetta/Dockerfile.gemma @@ -11,11 +11,46 @@ ARG URLREF_FLAXFORMER=https://github.com/google/flaxformer.git#main ARG SRC_PATH_FLAXFORMER=/opt/flaxformer ARG URLREF_PANOPTICAPI=https://github.com/akolesnikoff/panopticapi.git#mute ARG SRC_PATH_PANOPTICAPI=/opt/panopticapi +ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master +ARG SRC_PATH_TFTEXT=/opt/tensorflow-text + +############################################################################### +## Build several packages which do not have working amd64/arm64 pip wheels +############################################################################### + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder + +#------------------------------------------------------------------------------ +# build tensorflow-text from source +#------------------------------------------------------------------------------ +FROM wheel-builder as tftext-builder +ARG URLREF_TFTEXT +ARG SRC_PATH_TFTEXT + +RUN <<"EOF" bash -exu -o pipefail +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 +git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT} +cd ${SRC_PATH_TFTEXT} + +# The tftext build script queries GitHub, but these requests are sometimes +# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE. +# A workaround (needs to be updated when the tensorflow version changes): +sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh + +# Newer versions of LLVM make lld's --undefined-version check of lld is strict +# by default (https://reviews.llvm.org/D135402), but the tftext build seems to +# rely on this behavior. +echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scripts/configure.sh + +./oss_scripts/run_build.sh +EOF ############################################################################### ## Download source and add auxiliary scripts ############################################################################### +ARG BASE_IMAGE FROM ${BASE_IMAGE} as mealkit ARG URLREF_GEMMA ARG SRC_PATH_GEMMA @@ -27,7 +62,11 @@ ARG URLREF_FLAXFORMER ARG SRC_PATH_FLAXFORMER ARG URLREF_PANOPTICAPI ARG SRC_PATH_PANOPTICAPI +ARG URLREF_TFTEXT +ARG SRC_PATH_TFTEXT +COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml +COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ RUN <<"EOF" bash -ex git-clone.sh ${URLREF_GEMMA} ${SRC_PATH_GEMMA} @@ -35,11 +74,12 @@ git-clone.sh ${URLREF_BIG_VISION} ${SRC_PATH_BIG_VISION} git-clone.sh ${URLREF_COMMON_LOOP_UTILS} ${SRC_PATH_COMMON_LOOP_UTILS} git-clone.sh ${URLREF_FLAXFORMER} ${SRC_PATH_FLAXFORMER} git-clone.sh ${URLREF_PANOPTICAPI} ${SRC_PATH_PANOPTICAPI} -echo "-e file://${SRC_PATH_GEMMA}" >> /opt/pip-tools.d/requirements-gemma.in -echo "-e file://${SRC_PATH_COMMON_LOOP_UTILS}" >> /opt/pip-tools.d/requirements-gemma.in -echo "-e file://${SRC_PATH_FLAXFORMER}" >> /opt/pip-tools.d/requirements-gemma.in -echo "-e file://${SRC_PATH_PANOPTICAPI}" >> /opt/pip-tools.d/requirements-gemma.in -echo "ipython==8.2 +echo " +-e file://${SRC_PATH_GEMMA} +-e file://${SRC_PATH_COMMON_LOOP_UTILS} +-e file://${SRC_PATH_FLAXFORMER} +-e file://${SRC_PATH_PANOPTICAPI} +ipython==8.2 jupyterlab gcloud overrides @@ -53,12 +93,12 @@ optax protobuf tfds-nightly tensorflow -tensorflow-text +tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl) tensorflow-gan " >> /opt/pip-tools.d/requirements-gemma.in EOF -ENV PYTHONPATH "${SRC_PATH_BIG_VISION}:${PYTHONPATH}" +ENV PYTHONPATH="${SRC_PATH_BIG_VISION}:${PYTHONPATH}" ADD ./rosetta/rosetta/projects/paligemma/Finetune_PaliGemma.ipynb ${SRC_PATH_GEMMA}/examples/Finetune_PaliGemma.ipynb ADD ./rosetta/rosetta/projects/paligemma/test_gemma.py ${SRC_PATH_GEMMA}/tests/test_gemma.py