diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py
index de81e1e8..49676fee 100644
--- a/dask_cuda/benchmarks/common.py
+++ b/dask_cuda/benchmarks/common.py
@@ -1,3 +1,4 @@
+import contextlib
 from argparse import Namespace
 from functools import partial
 from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple
@@ -7,7 +8,7 @@
 import pandas as pd
 
 import dask
-from distributed import Client
+from distributed import Client, performance_report
 
 from dask_cuda.benchmarks.utils import (
     address_to_index,
@@ -87,14 +88,20 @@ def run_benchmark(client: Client, args: Namespace, config: Config):
 
     If ``args.profile`` is set, the final run is profiled.
     """
+
     results = []
     for _ in range(max(0, args.warmup_runs)):
         config.bench_once(client, args, write_profile=None)
-    for _ in range(max(1, args.runs) - 1):
-        res = config.bench_once(client, args, write_profile=None)
-        results.append(res)
-    results.append(config.bench_once(client, args, write_profile=args.profile))
-    return results
+
+    ctx = contextlib.nullcontext()
+    if args.profile is not None:
+        ctx = performance_report(filename=args.profile)
+    with ctx:
+        for _ in range(max(1, args.runs) - 1):
+            res = config.bench_once(client, args, write_profile=None)
+            results.append(res)
+        results.append(config.bench_once(client, args, write_profile=args.profile_last))
+    return results
 
 
 def gather_bench_results(client: Client, args: Namespace, config: Config):
diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py
index eb402748..a9e7d833 100644
--- a/dask_cuda/benchmarks/local_cudf_groupby.py
+++ b/dask_cuda/benchmarks/local_cudf_groupby.py
@@ -98,10 +98,9 @@ def bench_once(client, args, write_profile=None):
         "False": False,
     }.get(args.shuffle, args.shuffle)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         t1 = clock()
diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py
index 96d9d7f5..6ebe005a 100644
--- a/dask_cuda/benchmarks/local_cudf_merge.py
+++ b/dask_cuda/benchmarks/local_cudf_merge.py
@@ -190,7 +190,7 @@ def bench_once(client, args, write_profile=None):
     if args.backend == "explicit-comms":
         ctx1 = dask.config.set(explicit_comms=True)
     if write_profile is not None:
-        ctx2 = performance_report(filename=args.profile)
+        ctx2 = performance_report(filename=write_profile)
 
     with ctx1:
         with ctx2:
diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py
index 9f7c7e1d..3a0955c4 100644
--- a/dask_cuda/benchmarks/local_cudf_shuffle.py
+++ b/dask_cuda/benchmarks/local_cudf_shuffle.py
@@ -121,10 +121,9 @@ def create_data(
 def bench_once(client, args, write_profile=None):
     data_processed, df = create_data(client, args)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         if args.backend in {"dask", "dask-noop"}:
diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py
index 4785ad4f..ba88db30 100644
--- a/dask_cuda/benchmarks/local_cupy.py
+++ b/dask_cuda/benchmarks/local_cupy.py
@@ -141,12 +141,11 @@ def bench_once(client, args, write_profile=None):
     chunksize = x.chunksize
     data_processed = sum(arg.nbytes for arg in func_args)
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         rng = start_range(message=args.operation, color="purple")
         result = func(*func_args)
diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py
index f0718ed1..ecefa52a 100644
--- a/dask_cuda/benchmarks/local_cupy_map_overlap.py
+++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py
@@ -42,12 +42,11 @@ def bench_once(client, args, write_profile=None):
 
     data_processed = x.nbytes
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         result = x.map_overlap(mean_filter, args.kernel_size, shape=ks)
         if args.backend == "dask-noop":
diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py
index 94f4f352..4f87a025 100644
--- a/dask_cuda/benchmarks/utils.py
+++ b/dask_cuda/benchmarks/utils.py
@@ -323,7 +323,16 @@ def parse_benchmark_args(
         metavar="PATH",
         default=None,
         type=str,
-        help="Write dask profile report (E.g. dask-report.html)",
+        help="Write dask profile report (E.g. dask-report.html) on all "
+        "iterations (excluding warmup).",
+    )
+    parser.add_argument(
+        "--profile-last",
+        metavar="PATH",
+        default=None,
+        type=str,
+        help="Write dask profile report (E.g. dask-report.html) on last "
+        "iteration only.",
     )
     # See save_benchmark_data for more information
     parser.add_argument(