diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index cf10aa23f..000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -dask_cuda/_version.py export-subst diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 9bfa630e1..be9daacfb 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,10 +1,14 @@ #python code owners dask_cuda/ @rapidsai/daskcuda-python-codeowners -#build/ops code owners -.github/ @rapidsai/ops-codeowners -ci/ @rapidsai/ops-codeowners -conda/ @rapidsai/ops-codeowners -**/Dockerfile @rapidsai/ops-codeowners -**/.dockerignore @rapidsai/ops-codeowners -dependencies.yaml @rapidsai/ops-codeowners +#CI code owners +/.github/ @rapidsai/ci-codeowners +/ci/ @rapidsai/ci-codeowners +/.pre-commit-config.yaml @rapidsai/ci-codeowners + +#packaging code owners +/.devcontainer/ @rapidsai/packaging-codeowners +/conda/ @rapidsai/packaging-codeowners +/dependencies.yaml @rapidsai/packaging-codeowners +/build.sh @rapidsai/packaging-codeowners +pyproject.toml @rapidsai/packaging-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 56fed450a..69b0de5f5 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -72,7 +72,7 @@ jobs: wheel-publish: needs: wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6688d0ff7..4e56d24d2 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 conda-python-build: needs: checks secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -46,7 +46,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request # Package is pure Python and only ever requires one build. diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2424729d7..7a884c5c6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 492c96f2c..335080816 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,8 +32,12 @@ repos: additional_dependencies: [types-cachetools] args: ["--module=dask_cuda", "--ignore-missing-imports"] pass_filenames: false + - repo: https://github.com/rapidsai/pre-commit-hooks + rev: v0.3.0 + hooks: + - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.8.0 + rev: v1.13.11 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ea704c1f..37c588511 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +# dask-cuda 24.08.00 (7 Aug 2024) + +## 🐛 Bug Fixes + +- Fix partitioning in explicit-comms shuffle ([#1356](https://github.com/rapidsai/dask-cuda/pull/1356)) [@rjzamora](https://github.com/rjzamora) +- Update cuDF's `assert_eq` import ([#1353](https://github.com/rapidsai/dask-cuda/pull/1353)) [@pentschev](https://github.com/pentschev) + +## 🚀 New Features + +- Add arguments to enable cuDF spilling and set statistics ([#1362](https://github.com/rapidsai/dask-cuda/pull/1362)) [@pentschev](https://github.com/pentschev) +- Allow disabling RMM in benchmarks ([#1352](https://github.com/rapidsai/dask-cuda/pull/1352)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- consolidate cuda_suffixed=false blocks in dependencies.yaml, fix update-version.sh ([#1367](https://github.com/rapidsai/dask-cuda/pull/1367)) [@jameslamb](https://github.com/jameslamb) +- split up CUDA-suffixed dependencies in dependencies.yaml ([#1364](https://github.com/rapidsai/dask-cuda/pull/1364)) [@jameslamb](https://github.com/jameslamb) +- Use verify-alpha-spec hook ([#1360](https://github.com/rapidsai/dask-cuda/pull/1360)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Use workflow branch 24.08 again ([#1359](https://github.com/rapidsai/dask-cuda/pull/1359)) 
[@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Build and test with CUDA 12.5.1 ([#1357](https://github.com/rapidsai/dask-cuda/pull/1357)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Drop `setup.py` ([#1354](https://github.com/rapidsai/dask-cuda/pull/1354)) [@jakirkham](https://github.com/jakirkham) +- remove .gitattributes ([#1350](https://github.com/rapidsai/dask-cuda/pull/1350)) [@jameslamb](https://github.com/jameslamb) +- make conda recipe data-loading stricter ([#1349](https://github.com/rapidsai/dask-cuda/pull/1349)) [@jameslamb](https://github.com/jameslamb) +- Adopt CI/packaging codeowners ([#1347](https://github.com/rapidsai/dask-cuda/pull/1347)) [@bdice](https://github.com/bdice) +- Remove text builds of documentation ([#1346](https://github.com/rapidsai/dask-cuda/pull/1346)) [@vyasr](https://github.com/vyasr) +- use rapids-build-backend ([#1343](https://github.com/rapidsai/dask-cuda/pull/1343)) [@jameslamb](https://github.com/jameslamb) + # dask-cuda 24.06.00 (5 Jun 2024) ## 🐛 Bug Fixes diff --git a/VERSION b/VERSION index 0bff6981a..ec8489fda 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.06.00 +24.08.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index a727d6daf..c2a65a414 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -7,7 +7,7 @@ rapids-logger "Create test conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key docs \ + --file-key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n docs @@ -23,16 +23,14 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="24.06" +export RAPIDS_VERSION_NUMBER="24.08" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" pushd docs sphinx-build -b dirhtml ./source _html -sphinx-build -b text ./source _text -mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"{html,txt} +mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"html mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html" -mv _text/* "${RAPIDS_DOCS_DIR}/dask-cuda/txt" popd rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index e2429e98c..48cece328 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -13,18 +13,12 @@ export CMAKE_GENERATOR=Ninja rapids-print-env -package_name="dask_cuda" - -version=$(rapids-generate-version) -commit=$(git rev-parse HEAD) - -echo "${version}" | tr -d '"' > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py" +rapids-generate-version > ./VERSION rapids-logger "Begin py build" conda config --set path_conflict prevent -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ conda/recipes/dask-cuda rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 9ec826733..828972dc2 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -6,22 +6,7 @@ set -euo pipefail source rapids-configure-sccache source rapids-date-string -version=$(rapids-generate-version) -commit=$(git rev-parse HEAD) - -echo "${version}" | tr -d '"' > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "dask_cuda/_version.py" - -# For nightlies we want to ensure that we're pulling in alphas as well. 
The -# easiest way to do so is to augment the spec with a constraint containing a -# min alpha version that doesn't affect the version bounds but does allow usage -# of alpha versions for that dependency without --pre -alpha_spec='' -if ! rapids-is-release-build; then - alpha_spec=',>=0.0.0a0' -fi - -sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" pyproject.toml +rapids-generate-version > ./VERSION python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check diff --git a/ci/check_style.sh b/ci/check_style.sh index 9bc26fe71..f8bc16525 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -8,7 +8,7 @@ rapids-logger "Create checks conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key checks \ + --file-key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n checks diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 0d1b8b1a5..a9fe1d02e 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -36,8 +36,8 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION # Bump testing dependencies -sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml -sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml +sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml +sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml DEPENDENCIES=( cudf @@ -45,10 +45,23 @@ DEPENDENCIES=( kvikio rapids-dask-dependency ) -for FILE in dependencies.yaml conda/environments/*.yaml; do - for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}" +for DEP in "${DEPENDENCIES[@]}"; do + for FILE in dependencies.yaml conda/environments/*.yaml; do + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" pyproject.toml +done + +UCX_DEPENDENCIES=( + distributed-ucxx + ucx-py + ucxx +) +for DEP in "${UCX_DEPENDENCIES[@]}"; do + for FILE in dependencies.yaml conda/environments/*.yaml; do + sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" "${FILE}" + done + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0\"/g" pyproject.toml done # CI files diff --git a/ci/test_python.sh b/ci/test_python.sh index b52cbb6d4..78330a403 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,7 +8,7 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_python \ + --file-key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test @@ -99,6 +99,59 @@ python dask_cuda/benchmarks/local_cudf_shuffle.py \ --runs 1 \ --backend explicit-comms +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --disable-rmm \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --disable-rmm-pool \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend 
explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --enable-rmm-async \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + +DASK_DATAFRAME__QUERY_PLANNING=True \ +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --rmm-pool-size 2GiB \ + --rmm-maximum-pool-size 4GiB \ + --enable-rmm-managed \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms + rapids-logger "Run local benchmark (legacy dd)" DASK_DATAFRAME__QUERY_PLANNING=False \ python dask_cuda/benchmarks/local_cudf_shuffle.py \ diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index afdb516fa..c0fed8e57 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.4 - cudatoolkit -- cudf==24.6.* -- dask-cudf==24.6.* -- distributed-ucxx==0.38.* -- kvikio==24.6.* +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,14 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.38.* -- ucxx==0.38.* +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a25b7e7a7..d1f6933cd 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.8 - cudatoolkit -- cudf==24.6.* -- dask-cudf==24.6.* -- distributed-ucxx==0.38.* -- kvikio==24.6.* +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -24,13 +24,14 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.38.* -- ucxx==0.38.* +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 65% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index ff2dea696..a27dea728 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml 
@@ -10,11 +10,11 @@ dependencies: - click >=8.1 - cuda-nvcc-impl - cuda-nvrtc -- cuda-version=12.2 -- cudf==24.6.* -- dask-cudf==24.6.* -- distributed-ucxx==0.38.* -- kvikio==24.6.* +- cuda-version=12.5 +- cudf==24.8.*,>=0.0.0a0 +- dask-cudf==24.8.*,>=0.0.0a0 +- distributed-ucxx==0.39.*,>=0.0.0a0 +- kvikio==24.8.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<2.0a0 @@ -25,13 +25,14 @@ dependencies: - pytest - pytest-cov - python>=3.9,<3.12 -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0dev0 +- rapids-dask-dependency==24.8.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.38.* -- ucxx==0.38.* +- ucx-py==0.39.*,>=0.0.0a0 +- ucxx==0.39.*,>=0.0.0a0 - zict>=2.0.0 -name: all_cuda-122_arch-x86_64 +name: all_cuda-125_arch-x86_64 diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 357e6dede..eba1a4fc0 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -21,17 +21,18 @@ build: script: - {{ PYTHON }} -m pip install . -vv entry_points: - {% for e in data.get("project", {}).get("scripts", {}).items() %} - - {{ e|join(" = ") }} + {% for entrypoint in data["project"]["scripts"] %} + - {{ entrypoint ~ ' = ' ~ data["project"]["scripts"][entrypoint] }} {% endfor %} requirements: host: - python - pip + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - python - {% for r in data.get("project", {}).get("dependencies", []) %} + {% for r in data["project"]["dependencies"] %} - {{ r }} {% endfor %} @@ -40,18 +41,18 @@ test: - dask_cuda commands: - dask cuda --help - {% for e in data.get("project", {}).get("scripts", {}).keys() %} - - {{ e }} --help - - {{ e|replace("-", " ") }} --help + {% for entrypoint in data["project"]["scripts"] %} + - {{ entrypoint }} --help + - {{ entrypoint|replace("-", " ") }} --help {% endfor %} about: - home: {{ data.get("project", {}).get("urls", {}).get("Homepage", "") }} - license: {{ data.get("project", {}).get("license", {}).get("text", "") }} + home: {{ data["project"]["urls"]["Homepage"] }} + license: {{ data["project"]["license"]["text"] }} license_file: - {% for e in data.get("tool", {}).get("setuptools", {}).get("license-files", []) %} + {% for e in data["tool"]["setuptools"]["license-files"] %} - ../../../{{ e }} {% endfor %} - summary: {{ data.get("project", {}).get("description", "") }} - dev_url: {{ data.get("project", {}).get("urls", {}).get("Source", "") }} - doc_url: {{ data.get("project", {}).get("urls", {}).get("Documentation", "") }} + summary: {{ data["project"]["description"] }} + dev_url: {{ data["project"]["urls"]["Source"] }} + doc_url: {{ data["project"]["urls"]["Documentation"] }} diff --git a/dask_cuda/_version.py b/dask_cuda/_version.py index c54072ba5..820bf10ba 100644 --- a/dask_cuda/_version.py +++ b/dask_cuda/_version.py @@ -15,6 +15,16 @@ import importlib.resources __version__ = ( - importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip() + importlib.resources.files(__package__).joinpath("VERSION").read_text().strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 1335334ab..7f48d4fae 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ 
-117,16 +117,18 @@ def run(client: Client, args: Namespace, config: Config): wait_for_cluster(client, shutdown_on_failure=True) assert len(client.scheduler_info()["workers"]) > 0 setup_memory_pools( - client, - args.type == "gpu", - args.rmm_pool_size, - args.disable_rmm_pool, - args.enable_rmm_async, - args.enable_rmm_managed, - args.rmm_release_threshold, - args.rmm_log_directory, - args.enable_rmm_statistics, - args.enable_rmm_track_allocations, + client=client, + is_gpu=args.type == "gpu", + disable_rmm=args.disable_rmm, + disable_rmm_pool=args.disable_rmm_pool, + pool_size=args.rmm_pool_size, + maximum_pool_size=args.rmm_maximum_pool_size, + rmm_async=args.enable_rmm_async, + rmm_managed=args.enable_rmm_managed, + release_threshold=args.rmm_release_threshold, + log_directory=args.rmm_log_directory, + statistics=args.enable_rmm_statistics, + rmm_track_allocations=args.enable_rmm_track_allocations, ) address_to_index, results, message_data = gather_bench_results(client, args, config) p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 5ac79a88d..48e4755fb 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -17,6 +17,7 @@ from distributed.comm.addressing import get_address_host from dask_cuda.local_cuda_cluster import LocalCUDACluster +from dask_cuda.utils import parse_device_memory_limit def as_noop(dsk): @@ -93,15 +94,41 @@ def parse_benchmark_args( "'forkserver' can be used to avoid issues with fork not being allowed " "after the networking stack has been initialised.", ) + cluster_args.add_argument( + "--disable-rmm", + action="store_true", + help="Disable RMM.", + ) + cluster_args.add_argument( + "--disable-rmm-pool", + action="store_true", + help="Uses RMM for allocations but without a memory pool.", + ) cluster_args.add_argument( "--rmm-pool-size", default=None, type=parse_bytes, help="The size of the RMM memory pool. Can be an integer (bytes) or a string " - "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used.", + "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used." + "" + ".. note::" + " This size is a per-worker configuration, and not cluster-wide.", ) cluster_args.add_argument( - "--disable-rmm-pool", action="store_true", help="Disable the RMM memory pool" + "--rmm-maximum-pool-size", + default=None, + help="When ``--rmm-pool-size`` is specified, this argument indicates the " + "maximum pool size. Can be an integer (bytes), or a string (like '4GB' or " + "'5000M'). By default, the total available memory on the GPU is used. " + "``rmm_pool_size`` must be specified to use RMM pool and to set the maximum " + "pool size." + "" + ".. note::" + " When paired with `--enable-rmm-async` the maximum size cannot be " + " guaranteed due to fragmentation." + "" + ".. 
note::" + " This size is a per-worker configuration, and not cluster-wide.", ) cluster_args.add_argument( "--enable-rmm-managed", @@ -407,10 +434,29 @@ def get_worker_device(): return -1 +def setup_rmm_resources(statistics=False, rmm_track_allocations=False): + import cupy + + import rmm + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + if statistics: + rmm.mr.set_current_device_resource( + rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) + ) + if rmm_track_allocations: + rmm.mr.set_current_device_resource( + rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource()) + ) + + def setup_memory_pool( dask_worker=None, + disable_rmm=None, + disable_rmm_pool=None, pool_size=None, - disable_pool=False, + maximum_pool_size=None, rmm_async=False, rmm_managed=False, release_threshold=None, @@ -418,45 +464,66 @@ def setup_memory_pool( statistics=False, rmm_track_allocations=False, ): - import cupy - import rmm - from rmm.allocators.cupy import rmm_cupy_allocator from dask_cuda.utils import get_rmm_log_file_name logging = log_directory is not None - if rmm_async: - rmm.mr.set_current_device_resource( - rmm.mr.CudaAsyncMemoryResource( - initial_pool_size=pool_size, release_threshold=release_threshold - ) - ) - else: - rmm.reinitialize( - pool_allocator=not disable_pool, - managed_memory=rmm_managed, - initial_pool_size=pool_size, - logging=logging, - log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory), - ) - cupy.cuda.set_allocator(rmm_cupy_allocator) - if statistics: - rmm.mr.set_current_device_resource( - rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) + if pool_size is not None: + pool_size = parse_device_memory_limit(pool_size, alignment_size=256) + + if maximum_pool_size is not None: + maximum_pool_size = parse_device_memory_limit( + maximum_pool_size, alignment_size=256 ) - if rmm_track_allocations: - rmm.mr.set_current_device_resource( - rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource()) + + if release_threshold is not None: + release_threshold = parse_device_memory_limit( + release_threshold, alignment_size=256 ) + if not disable_rmm: + if rmm_async: + mr = rmm.mr.CudaAsyncMemoryResource( + initial_pool_size=pool_size, + release_threshold=release_threshold, + ) + + if maximum_pool_size is not None: + mr = rmm.mr.LimitingResourceAdaptor( + mr, allocation_limit=maximum_pool_size + ) + + rmm.mr.set_current_device_resource(mr) + + setup_rmm_resources( + statistics=statistics, rmm_track_allocations=rmm_track_allocations + ) + else: + rmm.reinitialize( + pool_allocator=not disable_rmm_pool, + managed_memory=rmm_managed, + initial_pool_size=pool_size, + maximum_pool_size=maximum_pool_size, + logging=logging, + log_file_name=get_rmm_log_file_name( + dask_worker, logging, log_directory + ), + ) + + setup_rmm_resources( + statistics=statistics, rmm_track_allocations=rmm_track_allocations + ) + def setup_memory_pools( client, is_gpu, + disable_rmm, + disable_rmm_pool, pool_size, - disable_pool, + maximum_pool_size, rmm_async, rmm_managed, release_threshold, @@ -468,8 +535,10 @@ def setup_memory_pools( return client.run( setup_memory_pool, + disable_rmm=disable_rmm, + disable_rmm_pool=disable_rmm_pool, pool_size=pool_size, - disable_pool=disable_pool, + maximum_pool_size=maximum_pool_size, rmm_async=rmm_async, rmm_managed=rmm_managed, release_threshold=release_threshold, @@ -482,7 +551,9 @@ def setup_memory_pools( client.run_on_scheduler( setup_memory_pool, 
pool_size=1e9, - disable_pool=disable_pool, + disable_rmm=disable_rmm, + disable_rmm_pool=disable_rmm_pool, + maximum_pool_size=maximum_pool_size, rmm_async=rmm_async, rmm_managed=rmm_managed, release_threshold=release_threshold, diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index cc2d08437..6a3518e07 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -101,6 +101,20 @@ def cuda(): total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to disable spilling to host (i.e. allow full device memory usage).""", ) +@click.option( + "--enable-cudf-spill/--disable-cudf-spill", + default=False, + show_default=True, + help="""Enable automatic cuDF spilling. WARNING: This should NOT be used with + JIT-Unspill.""", +) +@click.option( + "--cudf-spill-stats", + type=int, + default=0, + help="""Set the cuDF spilling statistics level. This option has no effect if + `--enable-cudf-spill` is not specified.""", +) @click.option( "--rmm-pool-size", default=None, @@ -120,6 +134,10 @@ def cuda(): memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and to set the maximum pool size. + .. note:: + When paired with `--enable-rmm-async` the maximum size cannot be guaranteed due + to fragmentation. + .. note:: This size is a per-worker configuration, and not cluster-wide.""", ) @@ -326,6 +344,8 @@ def worker( name, memory_limit, device_memory_limit, + enable_cudf_spill, + cudf_spill_stats, rmm_pool_size, rmm_maximum_pool_size, rmm_managed_memory, @@ -398,6 +418,8 @@ def worker( name, memory_limit, device_memory_limit, + enable_cudf_spill, + cudf_spill_stats, rmm_pool_size, rmm_maximum_pool_size, rmm_managed_memory, diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index e25a7c142..b88c9bc98 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -20,7 +20,7 @@ from .device_host_file import DeviceHostFile from .initialize import initialize -from .plugins import CPUAffinity, PreImport, RMMSetup +from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup from .proxify_host_file import ProxifyHostFile from .utils import ( cuda_visible_devices, @@ -41,6 +41,8 @@ def __init__( name=None, memory_limit="auto", device_memory_limit="auto", + enable_cudf_spill=False, + cudf_spill_stats=0, rmm_pool_size=None, rmm_maximum_pool_size=None, rmm_managed_memory=False, @@ -166,6 +168,12 @@ def del_pid_file(): if device_memory_limit is None and memory_limit is None: data = lambda _: {} elif jit_unspill: + if enable_cudf_spill: + warnings.warn( + "Enabling cuDF spilling and JIT-Unspill together is not " + "safe, consider disabling JIT-Unspill." 
+ ) + data = lambda i: ( ProxifyHostFile, { @@ -217,6 +225,7 @@ def del_pid_file(): track_allocations=rmm_track_allocations, ), PreImport(pre_import), + CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats), }, name=name if nprocs == 1 or name is None else str(name) + "-" + str(i), local_directory=local_directory, diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 3f7b79514..70f123354 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -8,6 +8,9 @@ from operator import getitem from typing import Any, Callable, Dict, List, Optional, Set, TypeVar +import numpy as np +import pandas as pd + import dask import dask.config import dask.dataframe @@ -155,9 +158,16 @@ def compute_map_index( if column_names[0] == "_partitions": ind = df[column_names[0]] else: - ind = hash_object_dispatch( - df[column_names] if column_names else df, index=False - ) + # Need to cast numerical dtypes to be consistent + # with `dask.dataframe.shuffle.partitioning_index` + dtypes = {} + index = df[column_names] if column_names else df + for col, dtype in index.dtypes.items(): + if pd.api.types.is_numeric_dtype(dtype): + dtypes[col] = np.float64 + if dtypes: + index = index.astype(dtypes, errors="ignore") + ind = hash_object_dispatch(index, index=False) return ind % npartitions @@ -187,15 +197,8 @@ def partition_dataframe( partitions Dict of dataframe-partitions, mapping partition-ID to dataframe """ - if column_names[0] != "_partitions" and hasattr(df, "partition_by_hash"): - return dict( - zip( - range(npartitions), - df.partition_by_hash( - column_names, npartitions, keep_index=not ignore_index - ), - ) - ) + # TODO: Use `partition_by_hash` if/when dtype-casting is added + # (See: https://github.com/rapidsai/cudf/issues/16221) map_index = compute_map_index(df, column_names, npartitions) return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index) @@ -529,18 +532,19 @@ def shuffle( # TODO: can we do this without using `submit()` to avoid the overhead # of creating a Future for each dataframe partition? - futures = [] + _futures = {} for rank in ranks: for part_id in rank_to_out_part_ids[rank]: - futures.append( - c.client.submit( - getitem, - shuffle_result[rank], - part_id, - workers=[c.worker_addresses[rank]], - ) + _futures[part_id] = c.client.submit( + getitem, + shuffle_result[rank], + part_id, + workers=[c.worker_addresses[rank]], ) + # Make sure partitions are properly ordered + futures = [_futures.pop(i) for i in range(npartitions)] + # Create a distributed Dataframe from all the pieces divs = [None] * (len(futures) + 1) kwargs = {"meta": df_meta, "divisions": divs, "prefix": "explicit-comms-shuffle"} diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 7a5c8c13d..202373e9d 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -10,7 +10,7 @@ from .device_host_file import DeviceHostFile from .initialize import initialize -from .plugins import CPUAffinity, PreImport, RMMSetup +from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup from .proxify_host_file import ProxifyHostFile from .utils import ( cuda_visible_devices, @@ -73,6 +73,14 @@ class LocalCUDACluster(LocalCluster): starts spilling to host memory. Can be an integer (bytes), float (fraction of total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, 0, or ``None`` to disable spilling to host (i.e. 
allow full device memory usage). + enable_cudf_spill : bool, default False + Enable automatic cuDF spilling. + + .. warning:: + This should NOT be used together with JIT-Unspill. + cudf_spill_stats : int, default 0 + Set the cuDF spilling statistics level. This option has no effect if + ``enable_cudf_spill=False``. local_directory : str or None, default None Path on local machine to store temporary files. Can be a string (like ``"path/to/files"``) or ``None`` to fall back on the value of @@ -114,6 +122,10 @@ class LocalCUDACluster(LocalCluster): memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and to set the maximum pool size. + .. note:: + When paired with `--enable-rmm-async` the maximum size cannot be guaranteed + due to fragmentation. + .. note:: This size is a per-worker configuration, and not cluster-wide. rmm_managed_memory : bool, default False @@ -205,6 +217,8 @@ def __init__( threads_per_worker=1, memory_limit="auto", device_memory_limit=0.8, + enable_cudf_spill=False, + cudf_spill_stats=0, data=None, local_directory=None, shared_filesystem=None, @@ -255,6 +269,8 @@ def __init__( self.device_memory_limit = parse_device_memory_limit( device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES) ) + self.enable_cudf_spill = enable_cudf_spill + self.cudf_spill_stats = cudf_spill_stats self.rmm_pool_size = rmm_pool_size self.rmm_maximum_pool_size = rmm_maximum_pool_size @@ -298,6 +314,12 @@ def __init__( if device_memory_limit is None and memory_limit is None: data = {} elif jit_unspill: + if enable_cudf_spill: + warnings.warn( + "Enabling cuDF spilling and JIT-Unspill together is not " + "safe, consider disabling JIT-Unspill." + ) + data = ( ProxifyHostFile, { @@ -410,6 +432,7 @@ def new_worker_spec(self): track_allocations=self.rmm_track_allocations, ), PreImport(self.pre_import), + CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats), }, } ) diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py index 4eba97f2b..122f93ffa 100644 --- a/dask_cuda/plugins.py +++ b/dask_cuda/plugins.py @@ -14,6 +14,21 @@ def setup(self, worker=None): os.sched_setaffinity(0, self.cores) +class CUDFSetup(WorkerPlugin): + def __init__(self, spill, spill_stats): + self.spill = spill + self.spill_stats = spill_stats + + def setup(self, worker=None): + try: + import cudf + + cudf.set_option("spill", self.spill) + cudf.set_option("spill_stats", self.spill_stats) + except ImportError: + pass + + class RMMSetup(WorkerPlugin): def __init__( self, diff --git a/dask_cuda/tests/test_cudf_builtin_spilling.py b/dask_cuda/tests/test_cudf_builtin_spilling.py index d4c28ba06..80b1d482d 100644 --- a/dask_cuda/tests/test_cudf_builtin_spilling.py +++ b/dask_cuda/tests/test_cudf_builtin_spilling.py @@ -20,7 +20,7 @@ get_global_manager, set_global_manager, ) -from cudf.testing._utils import assert_eq # noqa: E402 +from cudf.testing import assert_eq # noqa: E402 if get_global_manager() is not None: pytest.skip( diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 974ad1319..505af12f1 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -231,6 +231,64 @@ def test_rmm_logging(loop): # noqa: F811 assert v is rmm.mr.LoggingResourceAdaptor +def test_cudf_spill_disabled(loop): # noqa: F811 + cudf = pytest.importorskip("cudf") + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): + with popen( + [ + "dask", + "cuda", + "worker", + "127.0.0.1:9369", + "--host", + 
"127.0.0.1", + "--no-dashboard", + ] + ): + with Client("127.0.0.1:9369", loop=loop) as client: + assert wait_workers(client, n_gpus=get_n_gpus()) + + cudf_spill = client.run( + cudf.get_option, + "spill", + ) + for v in cudf_spill.values(): + assert v is False + + cudf_spill_stats = client.run(cudf.get_option, "spill_stats") + for v in cudf_spill_stats.values(): + assert v == 0 + + +def test_cudf_spill(loop): # noqa: F811 + cudf = pytest.importorskip("cudf") + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): + with popen( + [ + "dask", + "cuda", + "worker", + "127.0.0.1:9369", + "--host", + "127.0.0.1", + "--no-dashboard", + "--enable-cudf-spill", + "--cudf-spill-stats", + "2", + ] + ): + with Client("127.0.0.1:9369", loop=loop) as client: + assert wait_workers(client, n_gpus=get_n_gpus()) + + cudf_spill = client.run(cudf.get_option, "spill") + for v in cudf_spill.values(): + assert v is True + + cudf_spill_stats = client.run(cudf.get_option, "spill_stats") + for v in cudf_spill_stats.values(): + assert v == 2 + + @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) def test_dashboard_address(loop): # noqa: F811 with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index f495648e0..2806dc1cd 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -109,7 +109,14 @@ def test_dataframe_merge_empty_partitions(): def check_partitions(df, npartitions): """Check that all values in `df` hashes to the same""" - hashes = partitioning_index(df, npartitions) + dtypes = {} + for col, dtype in df.dtypes.items(): + if pd.api.types.is_numeric_dtype(dtype): + dtypes[col] = np.float64 + if not dtypes: + dtypes = None + + hashes = partitioning_index(df, npartitions, cast_dtype=dtypes) if len(hashes) > 0: return len(hashes.unique()) == 1 else: @@ -128,11 +135,10 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): worker_class=IncreasedCloseTimeoutNanny, processes=True, ) as cluster: - with Client(cluster) as client: - all_workers = list(client.get_worker_logs().keys()) + with Client(cluster): comms.default_comms() np.random.seed(42) - df = pd.DataFrame({"key": np.random.random(100)}) + df = pd.DataFrame({"key": np.random.randint(0, high=100, size=100)}) if backend == "cudf": df = cudf.DataFrame.from_pandas(df) @@ -141,15 +147,13 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): for input_nparts in range(1, 5): for output_nparts in range(1, 5): - ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist( - workers=all_workers - ) + ddf1 = dd.from_pandas(df.copy(), npartitions=input_nparts) # To reduce test runtime, we change the batchsizes here instead # of using a test parameter. 
for batchsize in (-1, 1, 2): with dask.config.set(explicit_comms_batchsize=batchsize): ddf = explicit_comms_shuffle( - ddf, + ddf1, ["_partitions"] if _partitions else ["key"], npartitions=output_nparts, batchsize=batchsize, @@ -177,6 +181,32 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions): got = ddf.compute().sort_values("key") assert_eq(got, expected) + # Check that partitioning is consistent with "tasks" + ddf_tasks = ddf1.shuffle( + ["key"], + npartitions=output_nparts, + shuffle_method="tasks", + ) + for i in range(output_nparts): + expected_partition = ddf_tasks.partitions[ + i + ].compute()["key"] + actual_partition = ddf.partitions[i].compute()[ + "key" + ] + if backend == "cudf": + expected_partition = ( + expected_partition.values_host + ) + actual_partition = actual_partition.values_host + else: + expected_partition = expected_partition.values + actual_partition = actual_partition.values + assert all( + np.sort(expected_partition) + == np.sort(actual_partition) + ) + @pytest.mark.parametrize("nworkers", [1, 2, 3]) @pytest.mark.parametrize("backend", ["pandas", "cudf"]) diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index b05389e4c..b144d1114 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -500,6 +500,54 @@ async def test_worker_fraction_limits(): ) +@gen_test(timeout=20) +async def test_cudf_spill_disabled(): + cudf = pytest.importorskip("cudf") + + async with LocalCUDACluster( + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + cudf_spill = await client.run( + cudf.get_option, + "spill", + ) + for v in cudf_spill.values(): + assert v is False + + cudf_spill_stats = await client.run( + cudf.get_option, + "spill_stats", + ) + for v in cudf_spill_stats.values(): + assert v == 0 + + +@gen_test(timeout=20) +async def test_cudf_spill(): + cudf = pytest.importorskip("cudf") + + async with LocalCUDACluster( + enable_cudf_spill=True, + cudf_spill_stats=2, + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + cudf_spill = await client.run( + cudf.get_option, + "spill", + ) + for v in cudf_spill.values(): + assert v is True + + cudf_spill_stats = await client.run( + cudf.get_option, + "spill_stats", + ) + for v in cudf_spill_stats.values(): + assert v == 2 + + @pytest.mark.parametrize( "protocol", ["ucx", "ucxx"], diff --git a/dask_cuda/tests/test_version.py b/dask_cuda/tests/test_version.py new file mode 100644 index 000000000..f30b2847d --- /dev/null +++ b/dask_cuda/tests/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import dask_cuda + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(dask_cuda.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(dask_cuda.__version__, str) + assert len(dask_cuda.__version__) > 0 diff --git a/dependencies.yaml b/dependencies.yaml index 20c6ca05e..c3b629654 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.4", "11.8", "12.2"] + cuda: ["11.4", "11.8", "12.5"] arch: [x86_64] includes: - build_python @@ -74,6 +74,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: + - rapids-build-backend>=0.3.0,<0.4.0dev0 - setuptools>=64.0.0 cuda_version: specific: @@ -99,6 +100,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda @@ -153,23 +158,23 @@ dependencies: - numpy>=1.23,<2.0a0 - pandas>=1.3 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==24.6.* + - rapids-dask-dependency==24.8.*,>=0.0.0a0 - zict>=2.0.0 test_python: common: - output_types: [conda, requirements, pyproject] packages: - - cudf==24.6.* - - dask-cudf==24.6.* - - kvikio==24.6.* - pytest - pytest-cov - - ucx-py==0.38.* - output_types: [conda] packages: - - distributed-ucxx==0.38.* + - &cudf_unsuffixed cudf==24.8.*,>=0.0.0a0 + - &dask_cudf_unsuffixed dask-cudf==24.8.*,>=0.0.0a0 + - distributed-ucxx==0.39.*,>=0.0.0a0 + - &kvikio_unsuffixed kvikio==24.8.*,>=0.0.0a0 + - &ucx_py_unsuffixed ucx-py==0.39.*,>=0.0.0a0 - ucx-proc=*=gpu - - ucxx==0.38.* + - ucxx==0.39.*,>=0.0.0a0 specific: - output_types: conda matrices: @@ -181,3 +186,27 @@ dependencies: arch: aarch64 packages: - numactl-devel-cos7-aarch64 + - output_types: [requirements, pyproject] + matrices: + # kvikio should be added to the CUDA-version-specific matrices once there are wheels available + # ref: https://github.com/rapidsai/kvikio/pull/369 + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - cudf-cu12==24.8.*,>=0.0.0a0 + - dask-cudf-cu12==24.8.*,>=0.0.0a0 + - ucx-py-cu12==0.39.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - cudf-cu11==24.8.*,>=0.0.0a0 + - dask-cudf-cu11==24.8.*,>=0.0.0a0 + - ucx-py-cu11==0.39.*,>=0.0.0a0 + - matrix: + packages: + - *cudf_unsuffixed + - *dask_cudf_unsuffixed + - *kvikio_unsuffixed + - *ucx_py_unsuffixed diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst index aecbc1fd9..9fde8756a 100644 --- a/docs/source/explicit_comms.rst +++ b/docs/source/explicit_comms.rst @@ -14,4 +14,4 @@ Usage In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"`` key in the `Dask configuration `_. -It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance. +It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance. 
diff --git a/docs/source/install.rst b/docs/source/install.rst index e522ae3c1..43082a671 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -12,11 +12,11 @@ To use Dask-CUDA on your system, you will need: - A version of NVIDIA CUDA Toolkit compatible with the installed driver version; see Table 1 of `CUDA Compatibility -- Binary Compatibility `_ for an overview of CUDA Toolkit driver requirements Once the proper CUDA Toolkit version has been determined, it can be installed using along with Dask-CUDA using ``conda``. -To install the latest version of Dask-CUDA along with CUDA Toolkit 12.0: +To install the latest version of Dask-CUDA along with CUDA Toolkit 12.5: .. code-block:: bash - conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.0 + conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.5 Pip --- diff --git a/pyproject.toml b/pyproject.toml index 35d485381..b6c431d61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0dev0", "setuptools>=64.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. @@ -20,7 +21,7 @@ dependencies = [ "numpy>=1.23,<2.0a0", "pandas>=1.3", "pynvml>=11.0.0,<11.5", - "rapids-dask-dependency==24.6.*", + "rapids-dask-dependency==24.8.*,>=0.0.0a0", "zict>=2.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -49,12 +50,12 @@ docs = [ "sphinx-rtd-theme>=0.5.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. test = [ - "cudf==24.6.*", - "dask-cudf==24.6.*", - "kvikio==24.6.*", + "cudf==24.8.*,>=0.0.0a0", + "dask-cudf==24.8.*,>=0.0.0a0", + "kvikio==24.8.*,>=0.0.0a0", "pytest", "pytest-cov", - "ucx-py==0.38.*", + "ucx-py==0.39.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -129,6 +130,12 @@ filterwarnings = [ "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning", ] +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +dependencies-file = "dependencies.yaml" +disable-cuda = true +matrix-entry = "cuda_suffixed=true" + [tool.setuptools] license-files = ["LICENSE"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 606849326..000000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup()
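
Note (not part of the diff): the headline user-facing change above is the new cuDF spilling support — the `--enable-cudf-spill` / `--cudf-spill-stats` CLI flags, the matching `enable_cudf_spill` / `cudf_spill_stats` keywords on `LocalCUDACluster`, and the `CUDFSetup` worker plugin that applies them on each worker. The following is a minimal illustrative sketch of how those keywords would be exercised, modeled on the `test_cudf_spill` test added in `dask_cuda/tests/test_local_cuda_cluster.py`; it assumes dask-cuda 24.08 and cudf are installed and a CUDA-capable GPU is available, and is not itself part of this changeset.

    # Illustrative sketch only: enable the cuDF spilling options introduced in
    # dask-cuda 24.08 and verify that the CUDFSetup plugin applied them on workers.
    from dask.distributed import Client

    from dask_cuda import LocalCUDACluster


    def main():
        # enable_cudf_spill / cudf_spill_stats are the new LocalCUDACluster keywords;
        # per the warning added in cuda_worker.py, do not combine them with JIT-Unspill.
        with LocalCUDACluster(enable_cudf_spill=True, cudf_spill_stats=2) as cluster:
            with Client(cluster) as client:
                import cudf

                # client.run returns a dict mapping worker address -> option value.
                spill = client.run(cudf.get_option, "spill")
                spill_stats = client.run(cudf.get_option, "spill_stats")
                assert all(spill.values())
                assert all(v == 2 for v in spill_stats.values())


    if __name__ == "__main__":
        main()

The same options are exposed on the command line as `dask cuda worker --enable-cudf-spill --cudf-spill-stats 2`, as exercised by the new `test_cudf_spill` test in `dask_cuda/tests/test_dask_cuda_worker.py`.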