Merge remote-tracking branch 'upstream/dev' into retrieve

NVIDIA · Jul 11, 2024 · 596cd56 · 596cd56
2 parents 0c25968 + 5c70615
commit 596cd56
Show file tree

Hide file tree

Showing 59 changed files with 1,647 additions and 881 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -10,7 +10,7 @@ ci:
 
 repos:
       - repo: https://github.com/pre-commit/mirrors-clang-format
-        rev: v16.0.6
+        rev: v18.1.8
         hooks:
               - id: clang-format
                 types_or: [c, c++, cuda]

diff --git a/README.md b/README.md
@@ -204,28 +204,28 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/fczxbM1h6))
-- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/zcnGqfdMW))
-- [One single storage for multiple sets](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_subsets_example.cu) (see [live example in godbolt](https://godbolt.org/z/P9s4fKscj))
-- [Using shared memory as storage](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/shared_memory_example.cu) (see [live example in godbolt](https://godbolt.org/z/4dq58efc8))
-- [Using set as mapping table to handle large keys or indeterministic sentinels](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/mapping_table_example.cu) (see [live example in godbolt](https://godbolt.org/z/E1e3j86E4))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/96re4zhjo))
+- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/7aKWdGTfx))
+- [One single storage for multiple sets](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_subsets_example.cu) (see [live example in godbolt](https://godbolt.org/z/sMfqGxdha))
+- [Using shared memory as storage](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/shared_memory_example.cu) (see [live example in godbolt](https://godbolt.org/z/zdTnbE1q5))
+- [Using set as mapping table to handle large keys or indeterministic sentinels](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/mapping_table_example.cu) (see [live example in godbolt](https://godbolt.org/z/KfYo4nMss))
 
 ### `static_map`
 
 `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/n3x8s19hP))
-- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/K4oWe46PT))
-- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/nbb8Psd8z))
-- [Key histogram](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/count_by_key_example.cu) (see [live example in godbolt](https://godbolt.org/z/P8TW5194T))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/ndccPsj11))
+- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/e9j5o6f5M))
+- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/xx74Knn3r))
+- [Key histogram](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/count_by_key_example.cu) (see [live example in godbolt](https://godbolt.org/z/fz7q1vav6))
 
 ### `static_multimap`
 
 `cuco::static_multimap` is a fixed-size hash table that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multimap.cuh` for more detailed information.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/nx97W4f7q))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/aq8jz8G9e))
 
 ### `dynamic_map`
 
@@ -239,7 +239,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::distinct_count_estimator` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/ahjEoWM1E))
-- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/qebYY8Goj))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/sMfofM6qd))
+- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/156T9ox7h))
 
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -62,7 +62,8 @@ ConfigureBench(STATIC_MAP_BENCH
   hash_table/static_map/insert_bench.cu
   hash_table/static_map/find_bench.cu
   hash_table/static_map/contains_bench.cu
-  hash_table/static_map/erase_bench.cu)
+  hash_table/static_map/erase_bench.cu
+  hash_table/static_map/insert_or_apply_bench.cu)
 
 ###################################################################################################
 # - static_multiset benchmarks --------------------------------------------------------------------

diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark_defaults.hpp>
+#include <benchmark_utils.hpp>
+
+#include <cuco/static_map.cuh>
+#include <cuco/utility/key_generator.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/transform.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
+ */
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  using pair_type = cuco::pair<Key, Value>;
+
+  auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+  auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);
+
+  std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  thrust::device_vector<pair_type> pairs(num_keys);
+  thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) {
+    return pair_type(key, static_cast<Value>(key));
+  });
+
+  state.add_element_count(num_keys);
+
+  cuco::static_map map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+    map.clear_async({launch.get_stream()});
+
+    timer.start();
+    map.insert_or_apply_async(
+      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
+    timer.stop();
+  });
+}
+
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  state.skip("Key should be the same type as Value.");
+}
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_multiplicity")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_occupancy")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
diff --git a/examples/static_map/custom_type_example.cu b/examples/static_map/custom_type_example.cu
@@ -66,9 +66,7 @@ int main(void)
   auto pairs_begin = thrust::make_transform_iterator(
     thrust::make_counting_iterator<int32_t>(0),
     cuda::proclaim_return_type<cuco::pair<custom_key_type, custom_value_type>>(
-      [] __device__(auto i) {
-        return cuco::pair{custom_key_type{i}, custom_value_type{i}};
-      }));
+      [] __device__(auto i) { return cuco::pair{custom_key_type{i}, custom_value_type{i}}; }));
 
   // Construct a map with 100,000 slots using the given empty key/value sentinels. Note the
   // capacity is chosen knowing we will insert 80,000 keys, for an load factor of 80%.

diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@
 
 #include <thrust/device_vector.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/pair.h>
 #include <thrust/sequence.h>
 #include <thrust/transform.h>
 
@@ -40,14 +39,15 @@ int main(void)
   cuco::static_multimap<key_type, value_type> map{
     N * 2, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}};
 
-  thrust::device_vector<thrust::pair<key_type, value_type>> pairs(N);
+  thrust::device_vector<cuco::pair<key_type, value_type>> pairs(N);
 
   // Create a sequence of pairs. Eeach key has two matches.
   // E.g., {{0,0}, {1,1}, ... {0,25'000}, {1, 25'001}, ...}
-  thrust::transform(thrust::make_counting_iterator<int>(0),
-                    thrust::make_counting_iterator<int>(pairs.size()),
-                    pairs.begin(),
-                    [] __device__(auto i) { return thrust::make_pair(i % (N / 2), i); });
+  thrust::transform(
+    thrust::make_counting_iterator<int>(0),
+    thrust::make_counting_iterator<int>(pairs.size()),
+    pairs.begin(),
+    [] __device__(auto i) { return cuco::pair<key_type, value_type>{i % (N / 2), i}; });
 
   // Inserts all pairs into the map
   map.insert(pairs.begin(), pairs.end());

diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu
@@ -58,10 +58,10 @@ using storage_type = cuco::aow_storage<key_type, window_size>;
 /// Lightweight non-owning storage ref type
 using storage_ref_type = typename storage_type::ref_type;
 using ref_type         = cuco::static_set_ref<key_type,
-                                      cuda::thread_scope_device,
-                                      thrust::equal_to<key_type>,
-                                      probing_scheme_type,
-                                      storage_ref_type>;  ///< Set ref type
+                                              cuda::thread_scope_device,
+                                              thrust::equal_to<key_type>,
+                                              probing_scheme_type,
+                                              storage_ref_type>;  ///< Set ref type
 
 /// Sample data to insert and query
 __device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};

diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/storage/aow_storage_base.cuh>
 #include <cuco/extent.cuh>
 #include <cuco/utility/allocator.hpp>
 
 #include <cuda/std/array>
+#include <cuda/stream_ref>
 
 #include <cstddef>
 #include <cstdint>
@@ -122,15 +122,15 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
    * @param key Key to which all keys in `slots` are initialized
    * @param stream Stream used for executing the kernel
    */
-  void initialize(value_type key, cuda_stream_ref stream = {});
+  void initialize(value_type key, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
    *
    * @param key Key to which all keys in `slots` are initialized
    * @param stream Stream used for executing the kernel
    */
-  void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;
+  void initialize_async(value_type key, cuda::stream_ref stream = {}) noexcept;
 
  private:
   allocator_type allocator_;            ///< Allocator used to (de)allocate windows

diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/cuda_stream_ref.hpp