-
Notifications
You must be signed in to change notification settings - Fork 54
442 lines (402 loc) · 17.9 KB
/
ci.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
name: CI
on:
schedule:
- cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC
pull_request:
types:
- opened
- reopened
- ready_for_review
- synchronize
paths-ignore:
- '**.md'
workflow_dispatch:
inputs:
PUBLISH:
type: boolean
description: Publish dated images and update the 'latest' tag?
default: false
required: false
BUMP_MANIFEST:
type: boolean
description: Bump git repos in manifest.yaml to head of tree?
default: true
required: true
MERGE_BUMPED_MANIFEST:
type: boolean
description: "(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch"
default: false
required: false
CUDA_IMAGE:
type: string
description: CUDA image to use as base, e.g. nvidia/cuda:X.Y.Z-devel-ubuntu22.04
default: 'latest'
required: false
SOURCE_OVERRIDES:
type: string
description: |
A comma-separated PACKAGE=URL#REF list to override sources used by build.
PACKAGE∊{JAX,XLA,Flax,transformer-engine,T5X,paxml,praxis,maxtext,levanter,haliax,mujuco,mujuco-mpc,gemma,big-vision,common-loop-utils,flaxformer,panopticapi} (case-insensitive)
default: ''
required: false
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
permissions:
contents: write # to fetch code and push branch
actions: write # to cancel previous workflows
packages: write # to upload container
pull-requests: write # to make pull request for manifest bump
env:
DEFAULT_MANIFEST_ARTIFACT_NAME: bumped-manifest
jobs:
metadata:
runs-on: ubuntu-22.04
outputs:
BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }}
BUMP_MANIFEST: ${{ steps.manifest-branch.outputs.BUMP_MANIFEST }}
MANIFEST_ARTIFACT_NAME: ${{ steps.manifest-branch.outputs.MANIFEST_ARTIFACT_NAME }}
MANIFEST_BRANCH: ${{ steps.manifest-branch.outputs.MANIFEST_BRANCH }}
MERGE_BUMPED_MANIFEST: ${{ steps.manifest-branch.outputs.MERGE_BUMBED_MANIFEST }}
CUDA_IMAGE: ${{ steps.cuda-image.outputs.CUDA_IMAGE }}
steps:
- name: Cancel workflow run if the trigger is a draft PR
id: cancel-if-draft
if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
run: |
echo "Cancelling workflow for draft PR"
curl -X POST -H "Authorization: token ${{ github.token }}" \
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel"
while true; do sleep 1; done # blocks execution in case workflow cancellation takes time
- name: Set build date
id: date
shell: bash -x -e {0}
run: |
BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT
- name: Determine whether results will be 'published'
id: if-publish
shell: bash -x -e {0}
run: |
echo "PUBLISH=${{ github.event_name == 'schedule' || inputs.PUBLISH }}" >> $GITHUB_OUTPUT
- name: Set manifest branch name
id: manifest-branch
shell: bash -x -e {0}
run: |
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
BUMP_MANIFEST="${{ inputs.BUMP_MANIFEST }}"
else
BUMP_MANIFEST="true"
fi
MERGE_BUMPED_MANIFEST=${{ github.event_name == 'schedule' || inputs.MERGE_BUMPED_MANIFEST || 'false' }}
# Prepend nightly manifest branch with "z" to make it appear at the end
if [[ "$BUMP_MANIFEST" == "true" ]]; then
# This branch is for scheduled nightlies or manually run nightlies
MANIFEST_BRANCH=znightly-${{ steps.date.outputs.BUILD_DATE }}-${{ github.run_id }}
MANIFEST_ARTIFACT_NAME=${{ env.DEFAULT_MANIFEST_ARTIFACT_NAME }}
else
# This branch is for presubmits (no bumping needed)
MANIFEST_BRANCH=${{ github.sha }}
# Empty artifact name means to use the one in version control
MANIFEST_ARTIFACT_NAME=""
fi
echo "MANIFEST_BRANCH=$MANIFEST_BRANCH" | tee -a $GITHUB_OUTPUT
echo "MANIFEST_ARTIFACT_NAME=$MANIFEST_ARTIFACT_NAME" | tee -a $GITHUB_OUTPUT
echo "BUMP_MANIFEST=$BUMP_MANIFEST" | tee -a $GITHUB_OUTPUT
echo "MERGE_BUMBED_MANIFEST=$MERGE_BUMPED_MANIFEST" | tee -a $GITHUB_OUTPUT
if [[ "$BUMP_MANIFEST" == "false" && "$MERGE_BUMPED_MANIFEST" == "true" ]]; then
echo "Error: If BUMP_MANIFEST=false, MERGE_BUMPED_MANIFEST cannot be true" >&2
exit 1
fi
- name: Determine CUDA image to use
id: cuda-image
shell: bash -x -e {0}
run: |
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
CUDA_IMAGE="${{ inputs.CUDA_IMAGE }}"
else
CUDA_IMAGE="latest"
fi
echo "CUDA_IMAGE=${CUDA_IMAGE}" >> $GITHUB_OUTPUT
bump-manifest:
needs: metadata
runs-on: ubuntu-22.04
outputs:
SOURCE_URLREFS: ${{ steps.source-urlrefs.outputs.SOURCE_URLREFS }}
steps:
- name: Check out the repository under ${GITHUB_WORKSPACE}
uses: actions/checkout@v4
- name: Test if manifest bump is functional, and save result to a new file
working-directory: .github/container
shell: bash -x -e {0}
run: |
bash bump.sh --input-manifest manifest.yaml --output-manifest manifest.yaml.new --base-patch-dir ./patches-new
- name: Maybe replace current manifest/patches with the new one and show diff
working-directory: .github/container
shell: bash -x -e {0}
run: |
if [[ "${{ needs.metadata.outputs.BUMP_MANIFEST }}" == "true" ]]; then
mv manifest.yaml.new manifest.yaml
rm -rf patches
mv patches-new patches
else
rm -rf patches-new manifest.yaml.new
fi
sed -i 's|file://patches-new/|file://patches/|g' manifest.yaml
git diff
- name: Upload bumped manifest/patches to be used in build-base
if: needs.metadata.outputs.MANIFEST_ARTIFACT_NAME != ''
uses: actions/upload-artifact@v4
with:
name: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
path: |
.github/container/manifest.yaml
.github/container/patches
- name: Create URL ref build args
id: source-urlrefs
shell: bash -x -e {0}
run: |
# converts manifest yaml to a json object of {SOFTWARE_NAME: URL#REF, ...}
urlrefs=$(
cat .github/container/manifest.yaml |\
yq -o=json 'to_entries | .[] | select(.value.mode == "git-clone") | {( .key | upcase | sub("-", "_") ): .value.url + "#" + .value.latest_verified_commit}' |\
jq -c -s 'add'
)
# SOURCE_OVERRIDES is a comma-separated list of package=urlref pairs
IFS=, read -ra overrides <<< "${{ inputs.SOURCE_OVERRIDES }}"
for override in "${overrides[@]}"; do
PACKAGE=$(cut -d= -f 1 <<< "${override}" | tr '[:lower:]' '[:upper:]' | tr '-' '_')
URLREF=$(cut -d= -f 2- <<< "${override}")
urlrefs=$(echo "$urlrefs" | jq -c ". + {\"$PACKAGE\": \"$URLREF\"}")
done
echo "SOURCE_URLREFS=${urlrefs}" >> $GITHUB_OUTPUT
amd64:
needs: [metadata, bump-manifest]
uses: ./.github/workflows/_ci.yaml
with:
ARCHITECTURE: amd64
BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }}
MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }}
secrets: inherit
arm64:
needs: [metadata, bump-manifest]
uses: ./.github/workflows/_ci.yaml
with:
ARCHITECTURE: arm64
BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }}
MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }}
secrets: inherit
# Only merge if everything succeeds
merge-new-manifest:
runs-on: ubuntu-22.04
if: ${{ !cancelled() && needs.metadata.outputs.MERGE_BUMPED_MANIFEST == 'true' && needs.metadata.outputs.MANIFEST_BRANCH != github.sha }}
needs:
- metadata
- amd64
- arm64
steps:
- name: "Tests Succeeded: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}"
id: test_result
run:
echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT
- name: Check out the repository under ${GITHUB_WORKSPACE}
uses: actions/checkout@v4
- name: Delete checked-out manifest and patches
run: |
rm .github/container/manifest.yaml
rm -rf .github/container/patches
- name: Replace checked-out manifest file/patches with bumped one
uses: actions/download-artifact@v4
with:
name: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
path: .github/container/
- name: "Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}"
id: local_branch
shell: bash -x -e {0}
run: |
git config user.name "JAX-Toolbox CI"
git config user.email "jax@nvidia.com"
git switch -c ${{ needs.metadata.outputs.MANIFEST_BRANCH }}
git status
git add .github/container/patches/
git status
# In the unusual situation where the manifest is the same even after bumping,
# we will produce an empty commit with --allow-empty, which allows a PR to be
# made and merged even with no changeset.
git commit --allow-empty -a -m "Nightly Manifest Bump (${{ needs.metadata.outputs.BUILD_DATE }}) from: https://github.com/NVIDIA/JAX-Toolbox/actions/runs/${{ github.run_id }}"
- name: Try to merge manifest branch
id: merge_local
if: steps.test_result.outputs.SUCCEEDED == 'true'
# Merge can fail
continue-on-error: true
shell: bash -x -e {0}
run: |
git switch ${{ github.ref_name }}
# Pull this ref in case it was updated
git pull --rebase
git merge --ff-only ${{ needs.metadata.outputs.MANIFEST_BRANCH }}
# Push the new change
git push origin ${{ github.ref_name }}
# We will create a Draft PR & remote branch if:
# 1. The tests failed
# 2. The merge failed
- name: Create remote manifest branch
id: create_remote_branch
if: steps.test_result.outputs.SUCCEEDED == 'false' || steps.merge_local.outcome != 'success'
shell: bash -x -e {0}
run: |
# Always abort in case in-progress merge
git merge --abort || true
git switch ${{ needs.metadata.outputs.MANIFEST_BRANCH }}
# Since the merge failed, create a remote and follow up with a PR
git push --set-upstream origin ${{ needs.metadata.outputs.MANIFEST_BRANCH }}
- name: Creating Draft PR for MANIFEST_BRANCH=${{ needs.metadata.outputs.MANIFEST_BRANCH }}
id: create_pr
if: steps.test_result.outputs.SUCCEEDED == 'false' || steps.merge_local.outcome != 'success'
uses: octokit/request-action@v2.x
with:
route: POST /repos/{owner_and_repo}/pulls
owner_and_repo: ${{ github.repository }}
head: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}
# Always try to merge back into the branch that triggered this workflow
base: ${{ github.ref }}
body: |
https://github.com/NVIDIA/JAX-Toolbox/actions/runs/${{ github.run_id }}
title: Nightly Manifest Bump (${{ needs.metadata.outputs.BUILD_DATE }})
draft: true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: "Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}"
if: steps.create_pr.outcome == 'success'
run: |
echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY
# Guard delete in simple check to protect other branches
- name: Check that the branch matches znightly- prefix
run: |
if [[ "${{ needs.metadata.outputs.MANIFEST_BRANCH }}" != znightly-* ]]; then
echo Tried to delete MANIFEST_BRANCH=${{ needs.metadata.outputs.MANIFEST_BRANCH }}, but did not start with "znightly-"
exit 1
fi
# If merging fails b/c upstream conflict, branch is deleted to avoid clutter since changeset is preserved in PR
- name: Deleting remote MANIFEST_BRANCH=${{ needs.metadata.outputs.MANIFEST_BRANCH }}
# Delete can fail if branch was already deleted or not created, e.g., if the PR successfully merges, then branch is also already deleted.
continue-on-error: true
uses: octokit/request-action@v2.x
with:
route: DELETE /repos/{owner_and_repo}/git/refs/heads/${{ needs.metadata.outputs.MANIFEST_BRANCH }}
owner_and_repo: ${{ github.repository }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
make-publish-configs:
runs-on: ubuntu-22.04
if: ${{ !cancelled() }}
env:
MEALKIT_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax-mealkit' || 'mock-jax-mealkit' }}
FINAL_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax' || 'mock-jax' }}
needs:
- metadata
- amd64
- arm64
outputs:
PUBLISH_CONFIGS: ${{ steps.generate-configs.outputs.PUBLISH_CONFIGS }}
steps:
- id: generate-configs
shell: bash -eu -o pipefail {0}
run: |
declare -a FLAVORS=(
base
jax
triton
equinox
maxtext
levanter
upstream-t5x
upstream-pax
t5x
pax
gemma
)
declare -a STAGES=(
mealkit
final
)
## create JSON specs for a 1D matrix of container publication jobs
ALL_TAGS=$(
echo '${{ needs.amd64.outputs.DOCKER_TAGS }}' \
'${{ needs.arm64.outputs.DOCKER_TAGS }}' |\
jq -s 'add'
)
PUBLISH_CONFIGS='[]'
for stage in "${STAGES[@]}"; do
for flavor in "${FLAVORS[@]}";do
# collect images for different platforms, e.g. amd64 and arm64
matching_tags=$(
echo "$ALL_TAGS" |\
jq -c ".[] | select(.stage == \"${stage}\" and .flavor == \"${flavor}\" and .tag != \"\")"
)
# source_image is a list of all platform-specific tags
source_image=$(echo "${matching_tags}" | jq -c "[.tag]" | jq -s 'add')
# if the build job failed without producing any images, skip this flavor
n_source_images=$(echo "$source_image" | jq 'length')
if [[ $n_source_images -gt 0 ]]; then
echo "PUBLISH image $flavor with $n_source_images $stage containers"
# tag priority is the highest priority of all platform-specific tags
priority=$(echo "${matching_tags}" | jq -r ".priority" | jq -s 'max')
# put all final images in the `ghcr.io/nvidia/jax` namespace
# and mealkit images in `ghcr.io/nvidia/jax-toolbox-mealkit` namespace
case ${stage} in
mealkit)
target_image=${MEALKIT_IMAGE_REPO}
;;
final)
target_image=${FINAL_IMAGE_REPO}
;;
esac
PUBLISH_CONFIGS=$(
echo ${PUBLISH_CONFIGS} | jq -c ". + [{
\"flavor\": \"${flavor}\",
\"target_image\": \"${target_image}\",
\"priority\": \"${priority}\",
\"source_image\": ${source_image},
\"stage\": \"${stage}\"
}]"
)
else
echo "SKIPPED image $flavor with 0 $stage containers"
fi
done
done
PUBLISH_CONFIGS=$(echo "$PUBLISH_CONFIGS" | jq -c '{"config": .}')
echo ${PUBLISH_CONFIGS} | jq
echo "PUBLISH_CONFIGS=${PUBLISH_CONFIGS}" >> $GITHUB_OUTPUT
publish-containers:
needs:
- metadata
- make-publish-configs
if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }}
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.make-publish-configs.outputs.PUBLISH_CONFIGS) }}
uses: ./.github/workflows/_publish_container.yaml
with:
ARTIFACT_NAME: ${{ matrix.config.stage }}-${{ matrix.config.flavor }}
ARTIFACT_TAG: ${{ matrix.config.flavor }}-${{ needs.metadata.outputs.BUILD_DATE }}
SOURCE_IMAGE: ${{ join(matrix.config.source_image, ' ') }}
TARGET_IMAGE: ${{ matrix.config.target_image }}
TARGET_TAGS: |
type=raw,value=${{ matrix.config.flavor }},priority=${{ matrix.config.priority }}
type=raw,value=${{ matrix.config.flavor }}-${{ needs.metadata.outputs.BUILD_DATE }},priority=${{ matrix.config.priority }}
finalize:
needs: [metadata, amd64, arm64, publish-containers]
if: "!cancelled()"
uses: ./.github/workflows/_finalize.yaml
with:
BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }}
secrets: inherit