diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 67bbd027..3d097bcd 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
    with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 76014652..0e20bdaf 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
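Every `@branch-24.10` reference in these workflow files moves to `@branch-24.12` mechanically; the `ci/release/update-version.sh` hunk later in this diff shows the `sed` rule responsible. As a rough illustration only (not code from this repo), the same substitution could be written in Python like so:

```python
# Hypothetical Python equivalent of update-version.sh's
#   sed "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g"
# NEXT_SHORT_TAG is an assumption mirroring the script's variable.
import re
from pathlib import Path

NEXT_SHORT_TAG = "24.12"

for path in Path(".github/workflows").glob("*.yaml"):
    lines = path.read_text().splitlines()
    rewritten = [
        re.sub(r"@.*$", f"@branch-{NEXT_SHORT_TAG}", line)
        if "shared-workflows" in line
        else line
        for line in lines
    ]
    path.write_text("\n".join(rewritten) + "\n")
```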
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1a0e7d87..631a6173 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4707492a..a2202df3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,7 +37,7 @@ repos:
     hooks:
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
+    rev: v1.16.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8c992fb..3b0d08d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,29 @@
+# dask-cuda 24.12.00 (11 Dec 2024)
+
+## 🚨 Breaking Changes
+
+- Add warmup runs and profile all iterations to benchmarks ([#1402](https://github.com/rapidsai/dask-cuda/pull/1402)) [@pentschev](https://github.com/pentschev)
+
+## 🐛 Bug Fixes
+
+- Disable UCXX tests in CI ([#1406](https://github.com/rapidsai/dask-cuda/pull/1406)) [@pentschev](https://github.com/pentschev)
+- Ignore legacy Dask dataframe warnings ([#1397](https://github.com/rapidsai/dask-cuda/pull/1397)) [@pentschev](https://github.com/pentschev)
+- Reenable UCXX in CI ([#1396](https://github.com/rapidsai/dask-cuda/pull/1396)) [@pentschev](https://github.com/pentschev)
+
+## 🚀 New Features
+
+- Enable Pytorch to share same memory pool as RMM via cli ([#1392](https://github.com/rapidsai/dask-cuda/pull/1392)) [@VibhuJawa](https://github.com/VibhuJawa)
+
+## 🛠️ Improvements
+
+- enforce wheel size limits, README formatting in CI ([#1404](https://github.com/rapidsai/dask-cuda/pull/1404)) [@jameslamb](https://github.com/jameslamb)
+- Add warmup runs and profile all iterations to benchmarks ([#1402](https://github.com/rapidsai/dask-cuda/pull/1402)) [@pentschev](https://github.com/pentschev)
+- remove unnecessary cmake and sccache configuration ([#1400](https://github.com/rapidsai/dask-cuda/pull/1400)) [@jameslamb](https://github.com/jameslamb)
+- make conda installs in CI stricter ([#1395](https://github.com/rapidsai/dask-cuda/pull/1395)) [@jameslamb](https://github.com/jameslamb)
+- Limit output of pytest durations ([#1393](https://github.com/rapidsai/dask-cuda/pull/1393)) [@pentschev](https://github.com/pentschev)
+- Switch pytest `traceback` to `native` ([#1389](https://github.com/rapidsai/dask-cuda/pull/1389)) [@galipremsagar](https://github.com/galipremsagar)
+- Update PyNVML and set upper pin ([#1130](https://github.com/rapidsai/dask-cuda/pull/1130)) [@wence-](https://github.com/wence-)
+
 # dask-cuda 24.10.00 (9 Oct 2024)
 
 ## 🚨 Breaking Changes
diff --git a/VERSION b/VERSION
index 7c7ba044..af28c42b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.10.00
+24.12.00
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 42103004..58da36c7 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-dependency-file-generator \
   --output conda \
   --file-key docs \
@@ -21,9 +23,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 
 rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
-  dask-cuda
+  "dask-cuda=${RAPIDS_VERSION}"
 
-export RAPIDS_VERSION_NUMBER="24.10"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build Python docs"
@@ -33,4 +34,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"html
 mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html"
 popd
 
-rapids-upload-docs
+RAPIDS_VERSION_NUMBER="$(rapids-version-major-minor)" rapids-upload-docs
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 48cece32..c12a0dde 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -5,12 +5,8 @@ set -euo pipefail
 
 rapids-configure-conda-channels
 
-source rapids-configure-sccache
-
 source rapids-date-string
 
-export CMAKE_GENERATOR=Ninja
-
 rapids-print-env
 
 rapids-generate-version > ./VERSION
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 828972dc..760e46e3 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -3,11 +3,11 @@
 
 set -euo pipefail
 
-source rapids-configure-sccache
 source rapids-date-string
 
 rapids-generate-version > ./VERSION
 
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
+./ci/validate_wheel.sh dist
 
 RAPIDS_PY_WHEEL_NAME="dask-cuda" rapids-upload-wheels-to-s3 dist
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 2dbe504c..b229d280 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -68,7 +68,6 @@ done
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
-sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
 
 # Docs referencing source code
 find docs/source/ -type f -name *.rst -print0 | while IFS= read -r -d '' filename; do
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 78330a40..319efef2 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate Python testing dependencies"
 rapids-dependency-file-generator \
   --output conda \
@@ -29,7 +31,7 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
-  dask-cuda
+  "dask-cuda=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
@@ -50,9 +52,9 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \
 UCXPY_IFNAME=eth0 \
 UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
-timeout 60m pytest \
+timeout 90m pytest \
   -vv \
-  --durations=0 \
+  --durations=50 \
   --capture=no \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda.xml" \
@@ -71,9 +73,9 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \
 UCXPY_IFNAME=eth0 \
 UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
-timeout 30m pytest \
+timeout 60m pytest \
   -vv \
-  --durations=0 \
+  --durations=50 \
   --capture=no \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda-legacy.xml" \
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
new file mode 100755
index 00000000..60a80fce
--- /dev/null
+++ b/ci/validate_wheel.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+wheel_dir_relative_path=$1
+
+rapids-logger "validate packages with 'pydistcheck'"
+
+pydistcheck \
+    --inspect \
+    "$(echo ${wheel_dir_relative_path}/*.whl)"
+
+rapids-logger "validate packages with 'twine'"
+
+twine check \
+    --strict \
+    "$(echo ${wheel_dir_relative_path}/*.whl)"
diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index 3cfd9cb2..c7b20c69 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -10,28 +10,28 @@ dependencies:
 - click >=8.1
 - cuda-version=11.4
 - cudatoolkit
-- cudf==24.10.*,>=0.0.0a0
-- dask-cudf==24.10.*,>=0.0.0a0
-- distributed-ucxx==0.40.*,>=0.0.0a0
-- kvikio==24.10.*,>=0.0.0a0
+- cudf==24.12.*,>=0.0.0a0
+- dask-cudf==24.12.*,>=0.0.0a0
+- distributed-ucxx==0.41.*,>=0.0.0a0
+- kvikio==24.12.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
-- pynvml>=11.0.0,<11.5
+- pynvml>=11.0.0,<12.0.0a0
 - pytest
 - pytest-cov
 - python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.10.*,>=0.0.0a0
+- rapids-dask-dependency==24.12.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.40.*,>=0.0.0a0
-- ucxx==0.40.*,>=0.0.0a0
+- ucx-py==0.41.*,>=0.0.0a0
+- ucxx==0.41.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-114_arch-x86_64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index b7b99751..9fd24d4e 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -10,28 +10,28 @@ dependencies:
 - click >=8.1
 - cuda-version=11.8
 - cudatoolkit
-- cudf==24.10.*,>=0.0.0a0
-- dask-cudf==24.10.*,>=0.0.0a0
-- distributed-ucxx==0.40.*,>=0.0.0a0
-- kvikio==24.10.*,>=0.0.0a0
+- cudf==24.12.*,>=0.0.0a0
+- dask-cudf==24.12.*,>=0.0.0a0
+- distributed-ucxx==0.41.*,>=0.0.0a0
+- kvikio==24.12.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
-- pynvml>=11.0.0,<11.5
+- pynvml>=11.0.0,<12.0.0a0
 - pytest
 - pytest-cov
 - python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.10.*,>=0.0.0a0
+- rapids-dask-dependency==24.12.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.40.*,>=0.0.0a0
-- ucxx==0.40.*,>=0.0.0a0
+- ucx-py==0.41.*,>=0.0.0a0
+- ucxx==0.41.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 652a8f0c..cd7c1679 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -11,28 +11,28 @@ dependencies:
 - cuda-nvcc-impl
 - cuda-nvrtc
 - cuda-version=12.5
-- cudf==24.10.*,>=0.0.0a0
-- dask-cudf==24.10.*,>=0.0.0a0
-- distributed-ucxx==0.40.*,>=0.0.0a0
-- kvikio==24.10.*,>=0.0.0a0
+- cudf==24.12.*,>=0.0.0a0
+- dask-cudf==24.12.*,>=0.0.0a0
+- distributed-ucxx==0.41.*,>=0.0.0a0
+- kvikio==24.12.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc>=1.1.0
 - pandas>=1.3
 - pre-commit
-- pynvml>=11.0.0,<11.5
+- pynvml>=11.0.0,<12.0.0a0
 - pytest
 - pytest-cov
 - python>=3.10,<3.13
 - rapids-build-backend>=0.3.0,<0.4.0dev0
-- rapids-dask-dependency==24.10.*,>=0.0.0a0
+- rapids-dask-dependency==24.12.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.40.*,>=0.0.0a0
-- ucxx==0.40.*,>=0.0.0a0
+- ucx-py==0.41.*,>=0.0.0a0
+- ucxx==0.41.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-125_arch-x86_64
diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py
index 7f48d4fa..49676fee 100644
--- a/dask_cuda/benchmarks/common.py
+++ b/dask_cuda/benchmarks/common.py
@@ -1,3 +1,4 @@
+import contextlib
 from argparse import Namespace
 from functools import partial
 from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple
@@ -7,7 +8,7 @@
 import pandas as pd
 
 import dask
-from distributed import Client
+from distributed import Client, performance_report
 
 from dask_cuda.benchmarks.utils import (
     address_to_index,
@@ -87,12 +88,20 @@ def run_benchmark(client: Client, args: Namespace, config: Config):
 
     If ``args.profile`` is set, the final run is profiled.
     """
+
     results = []
-    for _ in range(max(1, args.runs) - 1):
-        res = config.bench_once(client, args, write_profile=None)
-        results.append(res)
-    results.append(config.bench_once(client, args, write_profile=args.profile))
-    return results
+    for _ in range(max(0, args.warmup_runs)):
+        config.bench_once(client, args, write_profile=None)
+
+    ctx = contextlib.nullcontext()
+    if args.profile is not None:
+        ctx = performance_report(filename=args.profile)
+    with ctx:
+        for _ in range(max(1, args.runs) - 1):
+            res = config.bench_once(client, args, write_profile=None)
+            results.append(res)
+        results.append(config.bench_once(client, args, write_profile=args.profile_last))
+    return results
 
 
 def gather_bench_results(client: Client, args: Namespace, config: Config):
diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py
index f094ff18..a9e7d833 100644
--- a/dask_cuda/benchmarks/local_cudf_groupby.py
+++ b/dask_cuda/benchmarks/local_cudf_groupby.py
@@ -98,10 +98,9 @@ def bench_once(client, args, write_profile=None):
         "False": False,
     }.get(args.shuffle, args.shuffle)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         t1 = clock()
@@ -260,12 +259,6 @@ def parse_args():
             "type": str,
             "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
     ]
 
     return parse_benchmark_args(
diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py
index e2b03520..6ebe005a 100644
--- a/dask_cuda/benchmarks/local_cudf_merge.py
+++ b/dask_cuda/benchmarks/local_cudf_merge.py
@@ -190,7 +190,7 @@ def bench_once(client, args, write_profile=None):
     if args.backend == "explicit-comms":
         ctx1 = dask.config.set(explicit_comms=True)
     if write_profile is not None:
-        ctx2 = performance_report(filename=args.profile)
+        ctx2 = performance_report(filename=write_profile)
 
     with ctx1:
         with ctx2:
@@ -346,12 +346,6 @@ def parse_args():
             "action": "store_true",
             "help": "Don't shuffle the keys of the left (base) dataframe.",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
         {
             "name": [
                 "-s",
diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py
index 25f42e59..3a0955c4 100644
--- a/dask_cuda/benchmarks/local_cudf_shuffle.py
+++ b/dask_cuda/benchmarks/local_cudf_shuffle.py
@@ -121,10 +121,9 @@ def create_data(
 def bench_once(client, args, write_profile=None):
     data_processed, df = create_data(client, args)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         if args.backend in {"dask", "dask-noop"}:
@@ -228,12 +227,6 @@ def parse_args():
             "type": str,
             "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
         {
             "name": "--ignore-index",
             "action": "store_true",
diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py
index c9c8fe1c..ba88db30 100644
--- a/dask_cuda/benchmarks/local_cupy.py
+++ b/dask_cuda/benchmarks/local_cupy.py
@@ -141,12 +141,11 @@ def bench_once(client, args, write_profile=None):
     chunksize = x.chunksize
     data_processed = sum(arg.nbytes for arg in func_args)
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         rng = start_range(message=args.operation, color="purple")
         result = func(*func_args)
@@ -297,12 +296,6 @@ def parse_args():
             "type": int,
             "help": "Chunk size (default 2500).",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs (default 3).",
-        },
         {
             "name": [
                 "-b",
diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py
index 8b975a24..ecefa52a 100644
--- a/dask_cuda/benchmarks/local_cupy_map_overlap.py
+++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py
@@ -42,12 +42,11 @@ def bench_once(client, args, write_profile=None):
 
     data_processed = x.nbytes
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         result = x.map_overlap(mean_filter, args.kernel_size, shape=ks)
         if args.backend == "dask-noop":
@@ -168,12 +167,6 @@ def parse_args():
             "type": int,
             "help": "Kernel size, 2*k+1, in each dimension (default 1)",
         },
-        {
-            "name": "--runs",
-            "default": 3,
-            "type": int,
-            "help": "Number of runs",
-        },
         {
             "name": [
                 "-b",
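All five benchmark modules now share one shape: `bench_once` consults only its `write_profile` argument, defaulting to a no-op context and opening a `performance_report` only when a path is supplied, while `run_benchmark` (in `common.py` above) decides which path to pass for warmup, profiled, and last-iteration runs. A minimal standalone sketch of that pattern, with a toy workload standing in for the real benchmarks:

```python
# Sketch of the shared context-selection pattern; the workload is a stand-in.
import contextlib

from distributed import Client, performance_report


def bench_once(client, write_profile=None):
    # No-op context by default; write an HTML report only when asked.
    ctx = contextlib.nullcontext()
    if write_profile is not None:
        ctx = performance_report(filename=write_profile)
    with ctx:
        return client.submit(sum, range(1_000)).result()


if __name__ == "__main__":
    client = Client()  # local cluster, for illustration
    bench_once(client)  # plain timed run
    bench_once(client, write_profile="dask-report.html")  # profiled run
```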
diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py
index de7e2ae1..4f87a025 100644
--- a/dask_cuda/benchmarks/utils.py
+++ b/dask_cuda/benchmarks/utils.py
@@ -323,7 +323,16 @@ def parse_benchmark_args(
         metavar="PATH",
         default=None,
         type=str,
-        help="Write dask profile report (E.g. dask-report.html)",
+        help="Write dask profile report (E.g. dask-report.html) on all "
+        "iterations (excluding warmup).",
+    )
+    parser.add_argument(
+        "--profile-last",
+        metavar="PATH",
+        default=None,
+        type=str,
+        help="Write dask profile report (E.g. dask-report.html) on last "
+        "iteration only.",
     )
     # See save_benchmark_data for more information
     parser.add_argument(
@@ -344,6 +353,18 @@ def parse_benchmark_args(
         type=parse_bytes,
         help="Bandwidth statistics: ignore messages smaller than this (default '1 MB')",
     )
+    parser.add_argument(
+        "--runs",
+        default=3,
+        type=int,
+        help="Number of runs",
+    )
+    parser.add_argument(
+        "--warmup-runs",
+        default=1,
+        type=int,
+        help="Number of warmup runs",
+    )
 
     for args in args_list:
         name = args.pop("name")
diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py
index a8c6d972..8101f020 100644
--- a/dask_cuda/cli.py
+++ b/dask_cuda/cli.py
@@ -13,7 +13,7 @@
 from distributed.utils import import_term
 
 from .cuda_worker import CUDAWorker
-from .utils import print_cluster_config
+from .utils import CommaSeparatedChoice, print_cluster_config
 
 logger = logging.getLogger(__name__)
@@ -164,6 +164,16 @@ def cuda():
     incompatible with RMM pools and managed memory, trying to enable both will
     result in failure.""",
 )
+@click.option(
+    "--set-rmm-allocator-for-libs",
+    "rmm_allocator_external_lib_list",
+    type=CommaSeparatedChoice(["cupy", "torch"]),
+    default=None,
+    show_default=True,
+    help="""
+    Set RMM as the allocator for external libraries. Provide a comma-separated
+    list of libraries to set, e.g., "torch,cupy".""",
+)
 @click.option(
     "--rmm-release-threshold",
     default=None,
@@ -351,6 +361,7 @@ def worker(
     rmm_maximum_pool_size,
     rmm_managed_memory,
     rmm_async,
+    rmm_allocator_external_lib_list,
     rmm_release_threshold,
     rmm_log_directory,
     rmm_track_allocations,
@@ -425,6 +436,7 @@ def worker(
         rmm_maximum_pool_size,
         rmm_managed_memory,
         rmm_async,
+        rmm_allocator_external_lib_list,
         rmm_release_threshold,
         rmm_log_directory,
         rmm_track_allocations,
diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py
index 3e03ed29..30c14450 100644
--- a/dask_cuda/cuda_worker.py
+++ b/dask_cuda/cuda_worker.py
@@ -47,6 +47,7 @@ def __init__(
         rmm_maximum_pool_size=None,
         rmm_managed_memory=False,
         rmm_async=False,
+        rmm_allocator_external_lib_list=None,
         rmm_release_threshold=None,
         rmm_log_directory=None,
         rmm_track_allocations=False,
@@ -231,6 +232,7 @@ def del_pid_file():
                     release_threshold=rmm_release_threshold,
                     log_directory=rmm_log_directory,
                     track_allocations=rmm_track_allocations,
+                    external_lib_list=rmm_allocator_external_lib_list,
                 ),
                 PreImport(pre_import),
                 CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index c037223b..7a24df43 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -143,6 +143,11 @@ class LocalCUDACluster(LocalCluster):
         The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is
         also incompatible with RMM pools and managed memory, trying to enable
         both will result in failure.
+    rmm_allocator_external_lib_list: str, list or None, default None
+        List of external libraries for which to set RMM as the allocator.
+        Supported options are: ``["torch", "cupy"]``. Can be a comma-separated string
+        (like ``"torch,cupy"``) or a list of strings (like ``["torch", "cupy"]``).
+        If ``None``, no external libraries will use RMM as their allocator.
     rmm_release_threshold: int, str or None, default None
         When ``rmm.async is True`` and the pool size grows beyond this value,
         unused memory held by the pool will be released at the next synchronization point.
@@ -231,6 +236,7 @@ def __init__(
         rmm_maximum_pool_size=None,
         rmm_managed_memory=False,
         rmm_async=False,
+        rmm_allocator_external_lib_list=None,
         rmm_release_threshold=None,
         rmm_log_directory=None,
         rmm_track_allocations=False,
@@ -265,6 +271,19 @@ def __init__(
         n_workers = len(CUDA_VISIBLE_DEVICES)
         if n_workers < 1:
             raise ValueError("Number of workers cannot be less than 1.")
+
+        if rmm_allocator_external_lib_list is not None:
+            if isinstance(rmm_allocator_external_lib_list, str):
+                rmm_allocator_external_lib_list = [
+                    v.strip() for v in rmm_allocator_external_lib_list.split(",")
+                ]
+            elif not isinstance(rmm_allocator_external_lib_list, list):
+                raise ValueError(
+                    "rmm_allocator_external_lib_list must be either a comma-separated "
+                    "string or a list of strings. Examples: 'torch,cupy' "
+                    "or ['torch', 'cupy']"
+                )
+
         # Set nthreads=1 when parsing mem_limit since it only depends on n_workers
         logger = logging.getLogger(__name__)
         self.memory_limit = parse_memory_limit(
@@ -284,6 +303,8 @@ def __init__(
         self.rmm_managed_memory = rmm_managed_memory
         self.rmm_async = rmm_async
         self.rmm_release_threshold = rmm_release_threshold
+        self.rmm_allocator_external_lib_list = rmm_allocator_external_lib_list
+
         if rmm_pool_size is not None or rmm_managed_memory or rmm_async:
             try:
                 import rmm  # noqa F401
@@ -437,6 +458,7 @@ def new_worker_spec(self):
                     release_threshold=self.rmm_release_threshold,
                     log_directory=self.rmm_log_directory,
                     track_allocations=self.rmm_track_allocations,
+                    external_lib_list=self.rmm_allocator_external_lib_list,
                 ),
                 PreImport(self.pre_import),
                 CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats),
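With the constructor change above, the feature is reachable from Python as well as from the CLI flag added in `dask_cuda/cli.py`. A usage sketch (assumes a GPU machine with RMM, CuPy, and PyTorch installed):

```python
# rmm_allocator_external_lib_list accepts a list or the equivalent
# comma-separated string ("torch,cupy"), per the docstring above.
from distributed import Client

from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster(
    rmm_pool_size="1GB",  # give each worker an RMM pool to share
    rmm_allocator_external_lib_list=["torch", "cupy"],
)
client = Client(cluster)
```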
diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py
index 122f93ff..cd1928af 100644
--- a/dask_cuda/plugins.py
+++ b/dask_cuda/plugins.py
@@ -1,5 +1,6 @@
 import importlib
 import os
+from typing import Callable, Dict
 
 from distributed import WorkerPlugin
 
@@ -39,6 +40,7 @@ def __init__(
         release_threshold,
         log_directory,
         track_allocations,
+        external_lib_list,
     ):
         if initial_pool_size is None and maximum_pool_size is not None:
             raise ValueError(
@@ -61,6 +63,7 @@ def __init__(
         self.logging = log_directory is not None
         self.log_directory = log_directory
         self.rmm_track_allocations = track_allocations
+        self.external_lib_list = external_lib_list
 
     def setup(self, worker=None):
         if self.initial_pool_size is not None:
@@ -123,6 +126,70 @@ def setup(self, worker=None):
             mr = rmm.mr.get_current_device_resource()
             rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr))
 
+        if self.external_lib_list is not None:
+            for lib in self.external_lib_list:
+                enable_rmm_memory_for_library(lib)
+
+
+def enable_rmm_memory_for_library(lib_name: str) -> None:
+    """Enable RMM memory pool support for a specified third-party library.
+
+    This function allows the given library to utilize RMM's memory pool if it supports
+    integration with RMM. The library name is passed as a string argument, and if the
+    library is compatible, its memory allocator will be configured to use RMM.
+
+    Parameters
+    ----------
+    lib_name : str
+        The name of the third-party library to enable RMM memory pool support for.
+        Supported libraries are "cupy" and "torch".
+
+    Raises
+    ------
+    ValueError
+        If the library name is not supported or does not have RMM integration.
+    ImportError
+        If the required library is not installed.
+    """
+
+    # Mapping of supported libraries to their respective setup functions
+    setup_functions: Dict[str, Callable[[], None]] = {
+        "torch": _setup_rmm_for_torch,
+        "cupy": _setup_rmm_for_cupy,
+    }
+
+    if lib_name not in setup_functions:
+        supported_libs = ", ".join(setup_functions.keys())
+        raise ValueError(
+            f"The library '{lib_name}' is not supported for RMM integration. "
+            f"Supported libraries are: {supported_libs}."
+        )
+
+    # Call the setup function for the specified library
+    setup_functions[lib_name]()
+
+
+def _setup_rmm_for_torch() -> None:
+    try:
+        import torch
+    except ImportError as e:
+        raise ImportError("PyTorch is not installed.") from e
+
+    from rmm.allocators.torch import rmm_torch_allocator
+
+    torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+
+
+def _setup_rmm_for_cupy() -> None:
+    try:
+        import cupy
+    except ImportError as e:
+        raise ImportError("CuPy is not installed.") from e
+
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
+
 
 class PreImport(WorkerPlugin):
     def __init__(self, libraries):
diff --git a/dask_cuda/tests/pytest.ini b/dask_cuda/tests/pytest.ini
new file mode 100644
index 00000000..7b0a9f29
--- /dev/null
+++ b/dask_cuda/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
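The helper is also usable outside the `RMMSetup` plugin. A direct-use sketch (assumes RMM and CuPy are installed and a GPU is visible; for PyTorch, `change_current_allocator` must run before any CUDA allocation has been made):

```python
# Route CuPy allocations through an RMM pool in a plain script.
import rmm

from dask_cuda.plugins import enable_rmm_memory_for_library

rmm.reinitialize(pool_allocator=True)  # create the pool first
enable_rmm_memory_for_library("cupy")  # installs rmm_cupy_allocator
# enable_rmm_memory_for_library("numpy") would raise ValueError: unsupported.
```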
diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py
index ff4dbbae..74596fe2 100644
--- a/dask_cuda/utils.py
+++ b/dask_cuda/utils.py
@@ -9,6 +9,7 @@
 from multiprocessing import cpu_count
 from typing import Optional
 
+import click
 import numpy as np
 import pynvml
 import toolz
@@ -764,3 +765,13 @@ def get_rmm_memory_resource_stack(mr) -> list:
     if isinstance(mr, rmm.mr.StatisticsResourceAdaptor):
         return mr.allocation_counts["current_bytes"]
     return None
+
+
+class CommaSeparatedChoice(click.Choice):
+    def convert(self, value, param, ctx):
+        values = [v.strip() for v in value.split(",")]
+        for v in values:
+            if v not in self.choices:
+                choices_str = ", ".join(f"'{c}'" for c in self.choices)
+                self.fail(f"invalid choice(s): {v}. (choices are: {choices_str})")
+        return values
diff --git a/dependencies.yaml b/dependencies.yaml
index 9e6b3a10..fa6a56e0 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -157,8 +157,8 @@ dependencies:
       - numba>=0.57
      - numpy>=1.23,<3.0a0
       - pandas>=1.3
-      - pynvml>=11.0.0,<11.5
-      - rapids-dask-dependency==24.10.*,>=0.0.0a0
+      - pynvml>=11.0.0,<12.0.0a0
+      - rapids-dask-dependency==24.12.*,>=0.0.0a0
       - zict>=2.0.0
   test_python:
     common:
@@ -168,13 +168,13 @@ dependencies:
           - pytest-cov
       - output_types: [conda]
         packages:
-          - &cudf_unsuffixed cudf==24.10.*,>=0.0.0a0
-          - &dask_cudf_unsuffixed dask-cudf==24.10.*,>=0.0.0a0
-          - distributed-ucxx==0.40.*,>=0.0.0a0
-          - &kvikio_unsuffixed kvikio==24.10.*,>=0.0.0a0
-          - &ucx_py_unsuffixed ucx-py==0.40.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==24.12.*,>=0.0.0a0
+          - &dask_cudf_unsuffixed dask-cudf==24.12.*,>=0.0.0a0
+          - distributed-ucxx==0.41.*,>=0.0.0a0
+          - &kvikio_unsuffixed kvikio==24.12.*,>=0.0.0a0
+          - &ucx_py_unsuffixed ucx-py==0.41.*,>=0.0.0a0
           - ucx-proc=*=gpu
-          - ucxx==0.40.*,>=0.0.0a0
+          - ucxx==0.41.*,>=0.0.0a0
     specific:
       - output_types: conda
         matrices:
@@ -194,16 +194,16 @@
             cuda: "12.*"
             cuda_suffixed: "true"
           packages:
-            - cudf-cu12==24.10.*,>=0.0.0a0
-            - dask-cudf-cu12==24.10.*,>=0.0.0a0
-            - ucx-py-cu12==0.40.*,>=0.0.0a0
+            - cudf-cu12==24.12.*,>=0.0.0a0
+            - dask-cudf-cu12==24.12.*,>=0.0.0a0
+            - ucx-py-cu12==0.41.*,>=0.0.0a0
         - matrix:
             cuda: "11.*"
             cuda_suffixed: "true"
           packages:
-            - cudf-cu11==24.10.*,>=0.0.0a0
-            - dask-cudf-cu11==24.10.*,>=0.0.0a0
-            - ucx-py-cu11==0.40.*,>=0.0.0a0
+            - cudf-cu11==24.12.*,>=0.0.0a0
+            - dask-cudf-cu11==24.12.*,>=0.0.0a0
+            - ucx-py-cu11==0.41.*,>=0.0.0a0
         - matrix:
           packages:
             - *cudf_unsuffixed
diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst
index af317056..db621977 100644
--- a/docs/source/explicit_comms.rst
+++ b/docs/source/explicit_comms.rst
@@ -14,4 +14,4 @@ Usage
 
 In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"`` key in the `Dask configuration `_.
 
-It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance.
+It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance.
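`CommaSeparatedChoice` extends `click.Choice` so a single option accepts a validated, comma-separated list, which is what backs `--set-rmm-allocator-for-libs` earlier in this diff. A self-contained sketch with a hypothetical `demo` command (not part of the repo):

```python
import click

from dask_cuda.utils import CommaSeparatedChoice


@click.command()
@click.option("--libs", type=CommaSeparatedChoice(["cupy", "torch"]), default=None)
def demo(libs):
    # `--libs torch,cupy` arrives here already split and validated: ["torch", "cupy"]
    click.echo(f"libs={libs}")


if __name__ == "__main__":
    demo()
```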
diff --git a/pyproject.toml b/pyproject.toml
index 730225ad..f6332875 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,8 +20,8 @@ dependencies = [
     "numba>=0.57",
     "numpy>=1.23,<3.0a0",
     "pandas>=1.3",
-    "pynvml>=11.0.0,<11.5",
-    "rapids-dask-dependency==24.10.*,>=0.0.0a0",
+    "pynvml>=11.0.0,<12.0.0a0",
+    "rapids-dask-dependency==24.12.*,>=0.0.0a0",
     "zict>=2.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -50,12 +50,12 @@ docs = [
     "sphinx-rtd-theme>=0.5.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 test = [
-    "cudf==24.10.*,>=0.0.0a0",
-    "dask-cudf==24.10.*,>=0.0.0a0",
-    "kvikio==24.10.*,>=0.0.0a0",
+    "cudf==24.12.*,>=0.0.0a0",
+    "dask-cudf==24.12.*,>=0.0.0a0",
+    "kvikio==24.12.*,>=0.0.0a0",
     "pytest",
     "pytest-cov",
-    "ucx-py==0.40.*,>=0.0.0a0",
+    "ucx-py==0.41.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
@@ -128,6 +128,9 @@ filterwarnings = [
     # is enabled in both dask-cudf and dask-cuda.
     # See: https://github.com/rapidsai/dask-cuda/issues/1311
     "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning",
+    # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437
+    # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False`
+    "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning",
 ]
 
 [tool.rapids-build-backend]
@@ -149,3 +152,11 @@ exclude = [
     "docs.*",
     "tests.*",
 ]
+
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
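The new `[tool.pydistcheck]` table is what `ci/validate_wheel.sh` enforces in CI. As a back-of-the-envelope sketch only (assuming the `'75M'` threshold is read as mebibytes), the size check amounts to:

```python
# Rough, hypothetical equivalent of pydistcheck's
# "distro-too-large-compressed" rule with max_allowed_size_compressed = '75M'.
import glob
import os

MAX_COMPRESSED_BYTES = 75 * 1024 * 1024  # PyPI's hard limit is 100 MiB

for whl in glob.glob("dist/*.whl"):
    size = os.path.getsize(whl)
    print(f"{whl}: {size / 2**20:.1f} MiB")
    if size > MAX_COMPRESSED_BYTES:
        raise SystemExit(f"{whl} exceeds the configured ceiling")
```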