diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py
index de81e1e8..49676fee 100644
--- a/dask_cuda/benchmarks/common.py
+++ b/dask_cuda/benchmarks/common.py
@@ -1,3 +1,4 @@
+import contextlib
 from argparse import Namespace
 from functools import partial
 from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple
@@ -7,7 +8,7 @@
 import pandas as pd
 
 import dask
-from distributed import Client
+from distributed import Client, performance_report
 
 from dask_cuda.benchmarks.utils import (
     address_to_index,
@@ -87,14 +88,20 @@ def run_benchmark(client: Client, args: Namespace, config: Config):
 
     If ``args.profile`` is set, the final run is profiled.
     """
+
     results = []
     for _ in range(max(0, args.warmup_runs)):
         config.bench_once(client, args, write_profile=None)
-    for _ in range(max(1, args.runs) - 1):
-        res = config.bench_once(client, args, write_profile=None)
-        results.append(res)
-    results.append(config.bench_once(client, args, write_profile=args.profile))
-    return results
+
+    ctx = contextlib.nullcontext()
+    if args.profile is not None:
+        ctx = performance_report(filename=args.profile)
+    with ctx:
+        for _ in range(max(1, args.runs) - 1):
+            res = config.bench_once(client, args, write_profile=None)
+            results.append(res)
+        results.append(config.bench_once(client, args, write_profile=args.profile_last))
+    return results
 
 
 def gather_bench_results(client: Client, args: Namespace, config: Config):
diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py
index eb402748..a9e7d833 100644
--- a/dask_cuda/benchmarks/local_cudf_groupby.py
+++ b/dask_cuda/benchmarks/local_cudf_groupby.py
@@ -98,10 +98,9 @@ def bench_once(client, args, write_profile=None):
         "False": False,
     }.get(args.shuffle, args.shuffle)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         t1 = clock()
diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py
index 96d9d7f5..6ebe005a 100644
--- a/dask_cuda/benchmarks/local_cudf_merge.py
+++ b/dask_cuda/benchmarks/local_cudf_merge.py
@@ -190,7 +190,7 @@ def bench_once(client, args, write_profile=None):
     if args.backend == "explicit-comms":
         ctx1 = dask.config.set(explicit_comms=True)
     if write_profile is not None:
-        ctx2 = performance_report(filename=args.profile)
+        ctx2 = performance_report(filename=write_profile)
 
     with ctx1:
         with ctx2:
diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py
index 9f7c7e1d..3a0955c4 100644
--- a/dask_cuda/benchmarks/local_cudf_shuffle.py
+++ b/dask_cuda/benchmarks/local_cudf_shuffle.py
@@ -121,10 +121,9 @@ def create_data(
 def bench_once(client, args, write_profile=None):
     data_processed, df = create_data(client, args)
 
-    if write_profile is None:
-        ctx = contextlib.nullcontext()
-    else:
-        ctx = performance_report(filename=args.profile)
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
     with ctx:
         if args.backend in {"dask", "dask-noop"}:
diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py
index 4785ad4f..ba88db30 100644
--- a/dask_cuda/benchmarks/local_cupy.py
+++ b/dask_cuda/benchmarks/local_cupy.py
@@ -141,12 +141,11 @@ def bench_once(client, args, write_profile=None):
     chunksize = x.chunksize
     data_processed = sum(arg.nbytes for arg in func_args)
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         rng = start_range(message=args.operation, color="purple")
         result = func(*func_args)
diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py
index f0718ed1..ecefa52a 100644
--- a/dask_cuda/benchmarks/local_cupy_map_overlap.py
+++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py
@@ -42,12 +42,11 @@ def bench_once(client, args, write_profile=None):
 
     data_processed = x.nbytes
 
-    # Execute the operations to benchmark
-    if args.profile is not None and write_profile is not None:
-        ctx = performance_report(filename=args.profile)
-    else:
-        ctx = contextlib.nullcontext()
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
 
+    # Execute the operations to benchmark
     with ctx:
         result = x.map_overlap(mean_filter, args.kernel_size, shape=ks)
         if args.backend == "dask-noop":
diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py
index 94f4f352..4f87a025 100644
--- a/dask_cuda/benchmarks/utils.py
+++ b/dask_cuda/benchmarks/utils.py
@@ -323,7 +323,16 @@ def parse_benchmark_args(
         metavar="PATH",
         default=None,
         type=str,
-        help="Write dask profile report (E.g. dask-report.html)",
+        help="Write dask profile report (E.g. dask-report.html) on all "
+        "iterations (excluding warmup).",
+    )
+    parser.add_argument(
+        "--profile-last",
+        metavar="PATH",
+        default=None,
+        type=str,
+        help="Write dask profile report (E.g. dask-report.html) on last "
+        "iteration only.",
     )
     # See save_benchmark_data for more information
     parser.add_argument(