Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable nightly publishes per container based on test suite results #2654

Merged
merged 1 commit into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 18 additions & 30 deletions .github/workflows/docker_publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ on:
description: 'specify which sha value the image was built with.'
required: false
default: ''
# Images to publish. The docker-sync job matrix parses this value with
# fromJson, so image names must be supplied as a JSON array of strings.
arch:
  description: 'JSON array of images to build, e.g. ["cpu", "lmi"]. Valid names: cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64'
  type: string
  required: false
  default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'
workflow_call:
inputs:
mode:
Expand All @@ -27,6 +32,11 @@ on:
description: 'specify which sha value the image was built with.'
required: false
default: ''
# Images to publish. The docker-sync job matrix parses this value with
# fromJson, so image names must be supplied as a JSON array of strings.
arch:
  description: 'JSON array of images to build, e.g. ["cpu", "lmi"]. Valid names: cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64'
  type: string
  required: false
  default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'

permissions:
id-token: write
Expand All @@ -38,25 +48,13 @@ env:
ECR_REPO_REGION: "us-east-1"

jobs:
# Job: runs on the self-hosted "scheduler" runner and starts a Graviton
# instance by calling start_instance.sh with a runner registration token.
create-aarch64-runner:
runs-on: [ self-hosted, scheduler ]
steps:
# Requests a runner registration token from the GitHub REST API (curl --fail,
# authenticated with the ACTION_RUNNER_PERSONAL_TOKEN secret), extracts the
# "token" field with jq, strips the surrounding quotes, and passes it to
# start_instance.sh.
- name: Create new Graviton instance
id: create_aarch64
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_graviton $token djl-serving
# Re-exports the step output action_graviton_instance_id as aarch64_instance_id.
# NOTE(review): presumably start_instance.sh writes that step output — confirm.
outputs:
aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}

nightly-aarch64:
runs-on: [ self-hosted, aarch64 ]
docker-sync:
runs-on: ubuntu-latest
timeout-minutes: 60
needs: create-aarch64-runner
strategy:
  # Publish every requested image even if one of them fails.
  fail-fast: false
  matrix:
    # inputs.arch may be a JSON array ('["cpu", "lmi"]'), a JSON string
    # ('"cpu"'), or a bare image name (cpu). Arrays are parsed as-is and JSON
    # strings are wrapped in brackets. A bare name is JSON-encoded with toJSON
    # first; wrapping it directly would produce '[cpu]', which is not valid
    # JSON and makes fromJson fail.
    arch: ${{ startsWith(inputs.arch, '[') && fromJson(inputs.arch) || startsWith(inputs.arch, '"') && fromJson(format('[{0}]', inputs.arch)) || fromJson(format('[{0}]', toJSON(inputs.arch))) }}
steps:
- uses: actions/checkout@v4
- name: Clean docker env
Expand Down Expand Up @@ -88,11 +86,11 @@ jobs:
- name: Pull and sync to docker hub
working-directory: serving/docker
run: |
./scripts/push_image_from_ECR.sh $DJL_VERSION deepjavalibrary/djl-serving ${{ inputs.mode }} ${{ inputs.commit_sha }}
./scripts/push_image_from_ECR.sh $DJL_VERSION deepjavalibrary/djl-serving ${{ inputs.mode }} ${{ matrix.arch }} ${{ inputs.commit_sha }}
- name: Pull and sync to ECR
working-directory: serving/docker
run: |
./scripts/push_image_from_ECR.sh $DJL_VERSION $AWS_STAGING_ECR_REPO ${{ inputs.mode }} ${{ inputs.commit_sha }}
./scripts/push_image_from_ECR.sh $DJL_VERSION $AWS_STAGING_ECR_REPO ${{ inputs.mode }} ${{ matrix.arch }} ${{ inputs.commit_sha }}
- name: Retag image for release latest
if: ${{ inputs.mode == 'release' }}
working-directory: serving/docker
Expand All @@ -103,13 +101,3 @@ jobs:
working-directory: serving/docker
run: |
yes | docker system prune -a --volumes
# Job: stops the instance started by create-aarch64-runner.
# `if: always()` makes it run regardless of whether the jobs it needs succeeded.
stop-aarch64-runner:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [nightly-aarch64, create-aarch64-runner]
steps:
# Passes the aarch64_instance_id output of create-aarch64-runner to
# stop_instance.sh.
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-aarch64-runner.outputs.aarch64_instance_id }}
./stop_instance.sh $instance_id
47 changes: 47 additions & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@ on:
required: false
type: string
default: ''
# Workflow-level outputs exposed to callers of this reusable workflow: one
# failure flag per image family. Each value falls back to '0' when the
# referenced job output is empty.
# NOTE(review): a job output is presumably also empty when the job was skipped
# or cancelled, which would be reported here as '0' — confirm that is intended.
outputs:
failure_cpu:
value: ${{ jobs.test.outputs.failure_cpu || '0' }}
failure_gpu:
value: ${{ jobs.test.outputs.failure_gpu || '0' }}
failure_aarch64:
value: ${{ jobs.test.outputs.failure_aarch64 || '0' }}
failure_lmi:
value: ${{ jobs.test.outputs.failure_lmi || '0' }}
failure_trtllm:
value: ${{ jobs.test.outputs.failure_trtllm || '0' }}
# Neuron is flagged by either the integration test job or the
# transformers-neuronx container unit-test job.
failure_neuron:
value: ${{ jobs.test.outputs.failure_neuron || jobs.transformers-neuronx-container-unit-tests.outputs.failure || '0' }}

permissions:
id-token: write
Expand Down Expand Up @@ -125,45 +138,72 @@ jobs:
- test: TestCpuFull
instance: ubuntu-latest
gh-runner: true
failure-prefix: cpu
- test: TestCpuBoth
instance: ubuntu-latest
gh-runner: true
failure-prefix: cpu
- test: TestGpu
instance: g6
failure-prefix: gpu
- test: TestAarch64
instance: aarch64
failure-prefix: aarch64
- test: TestHfHandler
instance: g6
failure-prefix: lmi
- test: TestTrtLlmHandler1
instance: g6
failure-prefix: trtllm
- test: TestTrtLlmHandler2
instance: g6
failure-prefix: trtllm
- test: TestSchedulerSingleGPU
instance: g6
failure-prefix: lmi
- test: TestSchedulerMultiGPU
instance: g6
failure-prefix: lmi
- test: TestLmiDist1
instance: g6
failure-prefix: lmi
- test: TestLmiDist2
instance: g6
failure-prefix: lmi
- test: TestVllm1
instance: g6
failure-prefix: lmi
- test: TestVllmLora
instance: g6
failure-prefix: lmi
- test: TestLmiDistLora
instance: g6
failure-prefix: lmi
- test: TestNeuronx1
instance: inf2
failure-prefix: neuron
- test: TestNeuronx2
instance: inf2
failure-prefix: neuron
- test: TestNeuronxRollingBatch
instance: inf2
failure-prefix: neuron
- test: TestMultiModal
instance: g6
failure-prefix: lmi
- test: TestTextEmbedding
instance: g6
failure-prefix: lmi
- test: TestLmiDistPipelineParallel
instance: g6
failure-prefix: lmi
# Job outputs: one failure flag per image family, each read from the step with
# id "test-failure". That step writes "failure_<failure-prefix>=1" for the
# matrix entry that failed, so the other flags stay empty for that entry.
# NOTE(review): this is a matrix job; how outputs from different matrix entries
# are combined is defined by GitHub Actions — confirm it matches expectations.
outputs:
failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
failure_aarch64: ${{ steps.test-failure.outputs.failure_aarch64 }}
failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
failure_trtllm: ${{ steps.test-failure.outputs.failure_trtllm }}
failure_neuron: ${{ steps.test-failure.outputs.failure_neuron }}
steps:
- uses: actions/checkout@v4
- name: Clean env
Expand Down Expand Up @@ -222,13 +262,16 @@ jobs:
rm -rf outputs
rm awscurl
# Runs only when an earlier step failed: records the failure flag for this
# matrix entry's image family, dumps the test output files, and cleans up.
- name: On Failure
  id: test-failure
  if: ${{ failure() }}
  working-directory: tests/integration
  run: |
    # Record the failure before any cleanup. The default shell runs with
    # errexit, so a failing cleanup command (for example `rm awscurl` when the
    # file was never downloaded) would otherwise abort the step before the
    # flag is written, and the failure would go unreported.
    failure_prefix="${{ matrix.test.failure-prefix }}"
    echo "failure_${failure_prefix}=1" >> "$GITHUB_OUTPUT"
    # Print every regular file under outputs/ to the job log.
    for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
    sudo rm -rf outputs && sudo rm -rf models
    rm awscurl
    ./remove_container.sh
- name: Upload test logs
if: ${{ always() }}
uses: actions/upload-artifact@v4
Expand All @@ -245,6 +288,8 @@ jobs:
- SHA-${{ github.sha }}
timeout-minutes: 15
needs: create-runners
outputs:
failure: ${{ steps.failure.outputs.failure }}
steps:
- uses: actions/checkout@v4
- name: Clean env
Expand Down Expand Up @@ -300,10 +345,12 @@ jobs:
# Fail on failed tests
if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
# Runs only when an earlier step failed: records the failure flag and prints
# the test results log.
- name: On fail step
  id: failure
  if: ${{ failure() }}
  working-directory: engines/python/setup
  run: |
    # Write the flag first: if logs/results.log does not exist (the failure
    # happened before the tests produced it), `cat` exits non-zero and the
    # step would stop before recording the failure.
    echo "failure=1" >> "$GITHUB_OUTPUT"
    cat logs/results.log
- name: Upload test logs
uses: actions/upload-artifact@v4
with:
Expand Down
37 changes: 36 additions & 1 deletion .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,45 @@ jobs:
secrets: inherit
with:
tag-suffix: ${{ needs.get_image_tag_suffix.outputs.test_image_tag_suffix }}
# Builds the JSON list of images whose integration-test failure flag is "0".
determine_images_to_publish:
  # Runs regardless of the integration-test job's result.
  if: always()
  needs: [ integration-test ]
  runs-on: ubuntu-latest
  outputs:
    images: ${{ steps.generate-images.outputs.images }}
  steps:
    - name: Generate image list from test results
      id: generate-images
      run: |
        images=()
        # usage: add_if_passed <failure-flag> <image>...
        # Appends the given images when the flag is exactly "0".
        add_if_passed() {
          local flag="$1"
          shift
          if [[ "$flag" == "0" ]]; then
            images+=("$@")
          fi
        }
        add_if_passed "${{ needs.integration-test.outputs.failure_cpu }}" "cpu" "cpu-full"
        add_if_passed "${{ needs.integration-test.outputs.failure_gpu }}" "pytorch-gpu"
        add_if_passed "${{ needs.integration-test.outputs.failure_aarch64 }}" "aarch64"
        add_if_passed "${{ needs.integration-test.outputs.failure_lmi }}" "lmi"
        add_if_passed "${{ needs.integration-test.outputs.failure_trtllm }}" "tensorrt-llm"
        add_if_passed "${{ needs.integration-test.outputs.failure_neuron }}" "pytorch-inf2"
        # Serialize the bash array as a compact JSON array of strings.
        json_images=$(jq --compact-output --null-input '$ARGS.positional' --args -- "${images[@]}")
        echo "images are ${json_images}"
        echo "images=${json_images}" >> "$GITHUB_OUTPUT"
# Publishes the images selected by determine_images_to_publish through the
# reusable docker_publish workflow.
publish:
  # always(): publish the passing images even when some test suites failed.
  # The job is skipped when the image list is empty or missing, because an
  # empty list would give the called workflow a matrix with no values.
  if: ${{ always() && needs.determine_images_to_publish.outputs.images != '' && needs.determine_images_to_publish.outputs.images != '[]' }}
  # Single `needs` key; the job previously listed the key twice, which is
  # invalid YAML where most parsers silently keep only the last value.
  needs: [determine_images_to_publish]
  uses: ./.github/workflows/docker_publish.yml
  secrets: inherit
  with:
    mode: ${{ inputs.mode || 'nightly' }}
    commit_sha: ${{ github.sha }}
    arch: ${{ needs.determine_images_to_publish.outputs.images }}
30 changes: 14 additions & 16 deletions serving/docker/scripts/push_image_from_ECR.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,25 @@ fi
version=$1
to_repo=$2
mode=$3
commit_sha=${4:-$GITHUB_SHA} # Use parameter expansion for default value
image=$4
commit_sha=${5:-$GITHUB_SHA} # Use parameter expansion for default value

images=(cpu aarch64 cpu-full pytorch-inf2 pytorch-gpu lmi tensorrt-llm)

from_repo=$AWS_TMP_ECR_REPO

set -x
for image in "${images[@]}"; do

if [[ "$mode" == "release" ]]; then
if [[ "$image" == "cpu" ]]; then
tag=$version
else
tag="$version-$image"
fi
if [[ "$mode" == "release" ]]; then
if [[ "$image" == "cpu" ]]; then
tag=$version
else
tag="$version-$image"
fi
fi

if [[ "$mode" == "nightly" ]]; then
tag="$image-nightly"
fi
docker pull $from_repo:$image-$mode-$commit_sha
echo docker tag $from_repo:$image-$mode-$commit_sha $to_repo:$tag
echo docker push $to_repo:$tag
done
if [[ "$mode" == "nightly" ]]; then
tag="$image-nightly"
fi
# Pull the staged image, retag it for the destination repository, and push it.
src_image="$from_repo:$image-$mode-$commit_sha"
docker pull "$src_image"
# The tag has to be created for real, not just echoed: the push below fails if
# "$to_repo:$tag" does not exist locally.
docker tag "$src_image" "$to_repo:$tag"
docker push "$to_repo:$tag"
Loading