Skip to content

Commit

Permalink
enable nightly publishes per container based on test suite results
Browse files Browse the repository at this point in the history
  • Loading branch information
siddvenk committed Jan 7, 2025
1 parent 2df533b commit d838d98
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 48 deletions.
48 changes: 18 additions & 30 deletions .github/workflows/docker_publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ on:
description: 'specify which sha value the image was built with.'
required: false
default: ''
# Optional string input naming the image variants to publish.
# The default is a JSON array literal listing all seven variants.
arch:
description: 'which images to build [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64]'
type: string
required: false
default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'
workflow_call:
inputs:
mode:
Expand All @@ -27,6 +32,11 @@ on:
# Help text for this input (typo fixed: "aws built" -> "was built").
description: 'specify which sha value the image was built with.'
required: false
default: ''
# Optional string input naming the image variants to publish when this workflow
# is called from another workflow. The default is a JSON array literal listing
# all seven variants.
arch:
description: 'which images to build [ cpu, cpu-full, pytorch-inf2, pytorch-gpu, tensorrt-llm, lmi, aarch64]'
type: string
required: false
default: '["cpu", "cpu-full", "pytorch-inf2", "pytorch-gpu", "tensorrt-llm", "lmi", "aarch64"]'

permissions:
id-token: write
Expand All @@ -38,25 +48,13 @@ env:
ECR_REPO_REGION: "us-east-1"

jobs:
create-aarch64-runner:
runs-on: [ self-hosted, scheduler ]
steps:
- name: Create new Graviton instance
id: create_aarch64
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_graviton $token djl-serving
outputs:
aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}

nightly-aarch64:
runs-on: [ self-hosted, aarch64 ]
docker-sync:
runs-on: ubuntu-latest
timeout-minutes: 60
needs: create-aarch64-runner
# Fan out one publish job per image variant.
# `inputs.arch` is either a JSON array string (e.g. '["cpu", "lmi"]') or a
# single bare name (e.g. 'cpu'). A bare name is JSON-encoded with toJSON before
# being wrapped in brackets, so fromJson always receives valid JSON; the
# previous '[{0}]' wrapper produced '[cpu]', which fromJson rejects.
strategy:
  fail-fast: false
  matrix:
    arch: ${{ startsWith(inputs.arch, '[') && fromJson(inputs.arch) || fromJson(format('[{0}]', toJSON(inputs.arch))) }}
steps:
- uses: actions/checkout@v4
- name: Clean docker env
Expand Down Expand Up @@ -88,11 +86,11 @@ jobs:
- name: Pull and sync to docker hub
working-directory: serving/docker
run: |
./scripts/push_image_from_ECR.sh $DJL_VERSION deepjavalibrary/djl-serving ${{ inputs.mode }} ${{ inputs.commit_sha }}
./scripts/push_image_from_ECR.sh $DJL_VERSION deepjavalibrary/djl-serving ${{ inputs.mode }} ${{ matrix.arch }} ${{ inputs.commit_sha }}
- name: Pull and sync to ECR
working-directory: serving/docker
run: |
./scripts/push_image_from_ECR.sh $DJL_VERSION $AWS_STAGING_ECR_REPO ${{ inputs.mode }} ${{ inputs.commit_sha }}
./scripts/push_image_from_ECR.sh $DJL_VERSION $AWS_STAGING_ECR_REPO ${{ inputs.mode }} ${{ matrix.arch }} ${{ inputs.commit_sha }}
- name: Retag image for release latest
if: ${{ inputs.mode == 'release' }}
working-directory: serving/docker
Expand All @@ -103,13 +101,3 @@ jobs:
working-directory: serving/docker
run: |
yes | docker system prune -a --volumes
stop-aarch64-runner:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [nightly-aarch64, create-aarch64-runner]
steps:
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-aarch64-runner.outputs.aarch64_instance_id }}
./stop_instance.sh $instance_id
54 changes: 53 additions & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@ on:
required: false
type: string
default: ''
# Per-container-family failure flags exposed to calling workflows.
# Each value falls back to the string '0' when the underlying job output is empty.
outputs:
failure_cpu:
value: ${{ jobs.test.outputs.failure_cpu || '0' }}
failure_gpu:
value: ${{ jobs.test.outputs.failure_gpu || '0' }}
failure_aarch64:
value: ${{ jobs.test.outputs.failure_aarch64 || '0' }}
failure_lmi:
value: ${{ jobs.test.outputs.failure_lmi || '0' }}
failure_trtllm:
value: ${{ jobs.test.outputs.failure_trtllm || '0' }}
# Neuron also takes the unit-test job's failure output into account.
failure_neuron:
value: ${{ jobs.test.outputs.failure_neuron || jobs.transformers-neuronx-container-unit-tests.outputs.failure || '0' }}

permissions:
id-token: write
Expand Down Expand Up @@ -125,45 +138,72 @@ jobs:
- test: TestCpuFull
instance: ubuntu-latest
gh-runner: true
failure-prefix: cpu
- test: TestCpuBoth
instance: ubuntu-latest
gh-runner: true
failure-prefix: cpu
- test: TestGpu
instance: g6
failure-prefix: gpu
- test: TestAarch64
instance: aarch64
failure-prefix: aarch64
- test: TestHfHandler
instance: g6
failure-prefix: lmi
- test: TestTrtLlmHandler1
instance: g6
failure-prefix: trtllm
- test: TestTrtLlmHandler2
instance: g6
failure-prefix: trtllm
- test: TestSchedulerSingleGPU
instance: g6
failure-prefix: lmi
- test: TestSchedulerMultiGPU
instance: g6
failure-prefix: lmi
- test: TestLmiDist1
instance: g6
failure-prefix: lmi
- test: TestLmiDist2
instance: g6
failure-prefix: lmi
- test: TestVllm1
instance: g6
failure-prefix: lmi
- test: TestVllmLora
instance: g6
failure-prefix: lmi
- test: TestLmiDistLora
instance: g6
failure-prefix: lmi
- test: TestNeuronx1
instance: inf2
failure-prefix: neuron
- test: TestNeuronx2
instance: inf2
failure-prefix: neuron
- test: TestNeuronxRollingBatch
instance: inf2
failure-prefix: neuron
- test: TestMultiModal
instance: g6
failure-prefix: lmi
- test: TestTextEmbedding
instance: g6
failure-prefix: lmi
- test: TestLmiDistPipelineParallel
instance: g6
failure-prefix: lmi
# Job-level outputs forwarded from the step with id `test-failure`.
# That step writes a single `failure_<prefix>` key, so each matrix leg sets at
# most one of these.
outputs:
failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
failure_aarch64: ${{ steps.test-failure.outputs.failure_aarch64 }}
failure_lmi: ${{ steps.test-failure.outputs.failure_lmi }}
failure_trtllm: ${{ steps.test-failure.outputs.failure_trtllm }}
failure_neuron: ${{ steps.test-failure.outputs.failure_neuron }}
steps:
- uses: actions/checkout@v4
- name: Clean env
Expand Down Expand Up @@ -215,20 +255,28 @@ jobs:
# Log in to the ECR registry that hosts the test images, then run the test
# class selected by the matrix entry.
run: |
  # The registry region is the 4th dot-separated field of the repo host name.
  ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
  aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
  # The step result must come from pytest itself: the per-container failure
  # flags are derived from it, so a random exit code would make those flags
  # meaningless.
  python -m pytest -s -k ${{ matrix.test.test }} tests.py
- name: Cleanup
working-directory: tests/integration
run: |
rm -rf outputs
rm awscurl
# Runs only when an earlier step in the job failed: records the failure flag
# for this matrix entry, dumps the test output files, and cleans up.
- name: On Failure
  id: test-failure
  if: ${{ failure() }}
  working-directory: tests/integration
  run: |
    # Record the failure flag first. If it were written last, any error in the
    # cleanup commands below would abort the script before the flag is set and
    # the failure would go unreported.
    failure_prefix="${{ matrix.test.failure-prefix }}"
    echo "failure_${failure_prefix}=1" >> "$GITHUB_OUTPUT"
    for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
    sudo rm -rf outputs && sudo rm -rf models
    # -f: awscurl may be absent if the job failed before it was created.
    rm -f awscurl
    ./remove_container.sh
- name: Upload test logs
if: ${{ always() }}
uses: actions/upload-artifact@v4
Expand All @@ -245,6 +293,8 @@ jobs:
- SHA-${{ github.sha }}
timeout-minutes: 15
needs: create-runners
outputs:
failure: ${{ steps.failure.outputs.failure }}
steps:
- uses: actions/checkout@v4
- name: Clean env
Expand Down Expand Up @@ -300,10 +350,12 @@ jobs:
# Fail on failed tests
if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
# Runs only when an earlier step in the job failed: records the failure flag
# and prints the test result log.
- name: On fail step
  id: failure
  if: ${{ failure() }}
  working-directory: engines/python/setup
  run: |
    # Write the flag before printing the log: if logs/results.log is missing,
    # `cat` fails and would otherwise abort the script before the flag is set.
    echo "failure=1" >> "$GITHUB_OUTPUT"
    cat logs/results.log
- name: Upload test logs
uses: actions/upload-artifact@v4
with:
Expand Down
37 changes: 36 additions & 1 deletion .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,45 @@ jobs:
secrets: inherit
with:
tag-suffix: ${{ needs.get_image_tag_suffix.outputs.test_image_tag_suffix }}
# Builds the list of images to publish from the integration-test failure flags.
# `if: always()` lets this job run regardless of the integration-test result.
determine_images_to_publish:
if: always()
needs: [ integration-test ]
runs-on: ubuntu-latest
# `images` is the compact JSON array emitted by the generate-images step.
outputs:
images: ${{ steps.generate-images.outputs.images }}
steps:
# An image name is added only when its failure flag equals the string "0";
# the cpu flag gates both "cpu" and "cpu-full". The resulting bash array is
# serialized to a compact JSON array with jq and written to GITHUB_OUTPUT.
- name: Generate image list from test results
id: generate-images
run: |
images=()
if [[ "${{ needs.integration-test.outputs.failure_cpu }}" == "0" ]]; then
images+=("cpu")
images+=("cpu-full")
fi
if [[ "${{ needs.integration-test.outputs.failure_gpu }}" == "0" ]]; then
images+=("pytorch-gpu")
fi
if [[ "${{ needs.integration-test.outputs.failure_aarch64 }}" == "0" ]]; then
images+=("aarch64")
fi
if [[ "${{ needs.integration-test.outputs.failure_lmi }}" == "0" ]]; then
images+=("lmi")
fi
if [[ "${{ needs.integration-test.outputs.failure_trtllm }}" == "0" ]]; then
images+=("tensorrt-llm")
fi
if [[ "${{ needs.integration-test.outputs.failure_neuron }}" == "0" ]]; then
images+=("pytorch-inf2")
fi
json_images=$(jq --compact-output --null-input '$ARGS.positional' --args -- "${images[@]}")
echo "images are ${json_images}"
echo "images=${json_images}" >> "$GITHUB_OUTPUT"
# Publishes the images whose test suites passed by calling docker_publish.yml
# with the JSON list produced by determine_images_to_publish.
publish:
  # always(): still publish the passing images when some test suites failed.
  # Skip when the list job did not succeed or produced an empty list, because
  # an empty `arch` list would give the called workflow an empty matrix.
  if: ${{ always() && needs.determine_images_to_publish.result == 'success' && needs.determine_images_to_publish.outputs.images != '[]' }}
  needs: [determine_images_to_publish]
  uses: ./.github/workflows/docker_publish.yml
  secrets: inherit
  with:
    mode: ${{ inputs.mode || 'nightly' }}
    commit_sha: ${{ github.sha }}
    arch: ${{ needs.determine_images_to_publish.outputs.images }}
30 changes: 14 additions & 16 deletions serving/docker/scripts/push_image_from_ECR.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,25 @@ fi
# Positional arguments:
#   $1 version     - release version; used to build the tag in release mode
#   $2 to_repo     - destination repository to tag and push to
#   $3 mode        - "release" or "nightly"
#   $4 image       - single image variant to sync (e.g. cpu, lmi, aarch64)
#   $5 commit_sha  - optional; sha the source image was built with
version=$1
to_repo=$2
mode=$3
image=$4
commit_sha=${5:-$GITHUB_SHA} # Use parameter expansion for default value

from_repo=$AWS_TMP_ECR_REPO

set -x

if [[ -z "$image" ]]; then
  echo "image argument is required" >&2
  exit 1
fi

# Derive the destination tag. Fail fast on an unknown mode instead of going on
# with an empty tag, which would yield an invalid "<repo>:" reference.
case "$mode" in
  release)
    # The plain cpu image is published under the bare version number.
    if [[ "$image" == "cpu" ]]; then
      tag=$version
    else
      tag="$version-$image"
    fi
    ;;
  nightly)
    tag="$image-nightly"
    ;;
  *)
    echo "unsupported mode: $mode" >&2
    exit 1
    ;;
esac

source_ref="$from_repo:$image-$mode-$commit_sha"

docker pull "$source_ref"
# Actually run tag and push: with an `echo` prefix the commands were only
# printed, so nothing ever reached the destination repository.
docker tag "$source_ref" "$to_repo:$tag"
docker push "$to_repo:$tag"

0 comments on commit d838d98

Please sign in to comment.