Move host-bulk find details to OA base class
PointKernel committed May 15, 2024
1 parent fb3d490 commit 06c0688
Showing 7 changed files with 117 additions and 155 deletions.
83 changes: 78 additions & 5 deletions include/cuco/detail/open_addressing/kernels.cuh
@@ -18,15 +18,14 @@
#include <cuco/detail/utility/cuda.cuh>

#include <cub/block/block_reduce.cuh>

#include <cuda/atomic>
#include <cuda/functional>

#include <cooperative_groups.h>

#include <iterator>

namespace cuco {
namespace detail {
namespace cuco::detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
@@ -255,6 +254,81 @@ CUCO_KERNEL void contains_if_n(InputIt first,
}
}

/**
* @brief Finds the equivalent container elements of all keys in the range `[first, first + n)`.
*
* @note If the key `*(first + i)` has a match in the container, copies the match to `(output_begin
* + i)`. Else, copies the empty sentinel. Uses the CUDA Cooperative Groups API to leverage groups
* of multiple threads to find each key. This provides a significant boost in throughput compared to
* the non Cooperative Group `find` at moderate to high load factors.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize The size of the thread block
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param n Number of keys to query
* @param output_begin Beginning of the sequence of matched payloads retrieved for each key
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

using output_type = typename std::iterator_traits<OutputIt>::value_type;
__shared__ output_type output_buffer[BlockSize / CGSize];

auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;

auto const sentinel = [&]() {
if constexpr (has_payload) {
return ref.empty_value_sentinel();
} else {
return ref.empty_key_sentinel();
}
}();

auto output = cuda::proclaim_return_type<output_type>([&] __device__(auto found) {
if constexpr (has_payload) {
return found == ref.end() ? sentinel : found->second;
} else {
return found == ref.end() ? sentinel : *found;
}
});

while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
if constexpr (CGSize == 1) {
auto const found = ref.find(key);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
* synchronizing before writing back to global, we no longer rely on L1, preventing the
* increase in sector stores from L2 to global and improving performance.
*/
output_buffer[thread_idx] = output(found);
block.sync();
*(output_begin + idx) = output_buffer[thread_idx];
} else {
auto const tile = cg::tiled_partition<CGSize>(block);
auto const found = ref.find(tile, key);

if (tile.thread_rank() == 0) { *(output_begin + idx) = output(found); }
}
}
idx += loop_stride;
}
}

/**
* @brief Calculates the number of filled slots for the given window storage.
*
@@ -340,5 +414,4 @@ CUCO_KERNEL void rehash(typename ContainerRef::storage_ref_type storage_ref,
}
}

} // namespace detail
} // namespace cuco
} // namespace cuco::detail
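
The unified kernel above serves both static_set and static_map by detecting at compile time whether the container carries a payload (`has_payload`). The following is a minimal illustration of that dispatch, not part of the commit; `Ref` stands in for any cuco device ref exposing `key_type`, `value_type`, and the sentinel getters.

#include <type_traits>

// Sketch only: selects the sentinel written for unmatched queries.
// A set stores bare keys (key_type == value_type); a map stores pairs,
// so unmatched map queries receive the empty *value* sentinel instead.
template <typename Ref>
constexpr auto empty_sentinel_for(Ref const& ref)
{
  constexpr bool has_payload =
    not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
  if constexpr (has_payload) {
    return ref.empty_value_sentinel();
  } else {
    return ref.empty_key_sentinel();
  }
}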
31 changes: 31 additions & 0 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -498,6 +498,37 @@ class open_addressing_impl {
first, num_keys, stencil, pred, output_begin, container_ref);
}

/**
* @brief For all keys in the range `[first, last)`, asynchronously finds
* a match with its key equivalent to the query key.
*
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param last End of the sequence of keys
* @param output_begin Beginning of the sequence of matches retrieved for each key
* @param container_ref Non-owning device container ref used to access the slot storage
* @param stream Stream used for executing the kernels
*/
template <typename InputIt, typename OutputIt, typename Ref>
void find_async(InputIt first,
InputIt last,
OutputIt output_begin,
Ref container_ref,
cuda_stream_ref stream) const noexcept
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, container_ref);
}

/**
* @brief Retrieves all keys contained in the container.
*
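
For context on the launch configuration used by `find_async` above: each query key is processed by a cooperative group of `cg_size` threads, so the grid must supply `cg_size * num_keys` threads in total. Below is a plausible sketch of that computation, assuming the library's default 128-thread block; the actual `cuco::detail::grid_size` helper may differ in detail.

#include <cstdint>

// Hypothetical stand-in for cuco::detail::grid_size: one cooperative group of
// cg_size threads per key, rounded up to whole thread blocks.
constexpr std::int64_t bulk_find_grid_size(std::int64_t num_keys,
                                           std::int32_t cg_size,
                                           std::int32_t block_size = 128)
{
  auto const num_threads = num_keys * cg_size;
  return (num_threads + block_size - 1) / block_size;
}

// 1'000'000 keys, cg_size == 2, 128-thread blocks -> 15'625 blocks.
static_assert(bulk_find_grid_size(1'000'000, 2) == 15'625);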
68 changes: 2 additions & 66 deletions include/cuco/detail/static_map/kernels.cuh
@@ -25,9 +25,7 @@

#include <cooperative_groups.h>

namespace cuco {
namespace static_map_ns {
namespace detail {
namespace cuco::static_map_ns::detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
@@ -67,66 +65,4 @@ CUCO_KERNEL void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref
}
}

/**
* @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`.
*
* @note If the key `*(first + i)` has a match in the container, copies the payload of its matched
* element to `(output_begin + i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative
* Groups API to leverage groups of multiple threads to find each key. This provides a significant
* boost in throughput compared to the non Cooperative Group `find` at moderate to high load
* factors.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize The size of the thread block
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type`
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param n Number of keys to query
* @param output_begin Beginning of the sequence of matched payloads retrieved for each key
* @param ref Non-owning map device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

__shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize];

while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
if constexpr (CGSize == 1) {
auto const found = ref.find(key);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
* synchronizing before writing back to global, we no longer rely on L1, preventing the
* increase in sector stores from L2 to global and improving performance.
*/
output_buffer[thread_idx] =
found == ref.end() ? ref.empty_value_sentinel() : (*found).second;
block.sync();
*(output_begin + idx) = output_buffer[thread_idx];
} else {
auto const tile = cg::tiled_partition<CGSize>(block);
auto const found = ref.find(tile, key);

if (tile.thread_rank() == 0) {
*(output_begin + idx) = found == ref.end() ? ref.empty_value_sentinel() : (*found).second;
}
}
}
idx += loop_stride;
}
}

} // namespace detail
} // namespace static_map_ns
} // namespace cuco
} // namespace cuco::static_map_ns::detail
9 changes: 1 addition & 8 deletions include/cuco/detail/static_map/static_map.inl
@@ -374,14 +374,7 @@ template <typename InputIt, typename OutputIt>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

static_map_ns::detail::find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, ref(op::find));
impl_->find_async(first, last, output_begin, ref(op::find), stream);
}

template <class Key,
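
For reference, the host-bulk path that `static_map::find_async` now delegates to is exercised as follows; a minimal sketch with illustrative capacity, sentinel values, and variable names (not taken from this commit).

#include <cuco/static_map.cuh>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>

#include <cuda_runtime.h>

int main()
{
  std::size_t constexpr num_keys = 10'000;

  // Build {i, 2 * i} pairs on the host and copy them to the device.
  thrust::host_vector<cuco::pair<int, int>> h_pairs(num_keys);
  for (std::size_t i = 0; i < num_keys; ++i) {
    h_pairs[i] = cuco::pair<int, int>{static_cast<int>(i), 2 * static_cast<int>(i)};
  }
  thrust::device_vector<cuco::pair<int, int>> pairs = h_pairs;

  // Capacity and sentinels are illustrative; -1 marks empty slots and unmatched queries.
  cuco::static_map<int, int> map{2 * num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}};
  map.insert(pairs.begin(), pairs.end());

  thrust::device_vector<int> queries(num_keys);
  thrust::sequence(queries.begin(), queries.end());
  thrust::device_vector<int> payloads(num_keys);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Asynchronous host-bulk find: one payload (or -1) written per query key.
  map.find_async(queries.begin(), queries.end(), payloads.begin(), stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}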
66 changes: 2 additions & 64 deletions include/cuco/detail/static_set/kernels.cuh
@@ -20,75 +20,15 @@

#include <cub/block/block_reduce.cuh>
#include <cub/block/block_scan.cuh>

#include <cuda/atomic>

#include <cooperative_groups.h>

#include <iterator>

namespace cuco {
namespace static_set_ns {
namespace detail {
namespace cuco::static_set_ns::detail {

CUCO_SUPPRESS_KERNEL_WARNINGS
/**
* @brief Finds the equivalent set elements of all keys in the range `[first, last)`.
*
* If the key `*(first + i)` has a match in the set, copies its matched element to `(output_begin +
* i)`. Else, copies the empty key sentinel. Uses the CUDA Cooperative Groups API to leverage groups
* of multiple threads to find each key. This provides a significant boost in throughput compared to
* the non Cooperative Group `find` at moderate to high load factors.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize The size of the thread block
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the set's `key_type`
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param n Number of keys to query
* @param output_begin Beginning of the sequence of matched elements retrieved for each key
* @param ref Non-owning set device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

__shared__ typename Ref::key_type output_buffer[BlockSize / CGSize];

while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
if constexpr (CGSize == 1) {
auto const found = ref.find(key);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
* synchronizing before writing back to global, we no longer rely on L1, preventing the
* increase in sector stores from L2 to global and improving performance.
*/
output_buffer[thread_idx] = found == ref.end() ? ref.empty_key_sentinel() : *found;
block.sync();
*(output_begin + idx) = output_buffer[thread_idx];
} else {
auto const tile = cg::tiled_partition<CGSize>(block);
auto const found = ref.find(tile, key);

if (tile.thread_rank() == 0) {
*(output_begin + idx) = found == ref.end() ? ref.empty_key_sentinel() : *found;
}
}
}
idx += loop_stride;
}
}

template <typename CG,
typename Size,
@@ -342,6 +282,4 @@ CUCO_KERNEL void retrieve(InputIt first,
}
}

} // namespace detail
} // namespace static_set_ns
} // namespace cuco
} // namespace cuco::static_set_ns::detail
9 changes: 1 addition & 8 deletions include/cuco/detail/static_set/static_set.inl
@@ -297,14 +297,7 @@ template <typename InputIt, typename OutputIt>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

static_set_ns::detail::find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, ref(op::find));
impl_->find_async(first, last, output_begin, ref(op::find), stream);
}

template <class Key,
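
Likewise for static_set, which now routes through the same base-class implementation. A brief host-side sketch with illustrative capacity and sentinel; the synchronous `find` forwards to `find_async` and then synchronizes the stream.

#include <cuco/static_set.cuh>

#include <thrust/device_vector.h>
#include <thrust/sequence.h>

int main()
{
  std::size_t constexpr num_keys = 10'000;

  thrust::device_vector<int> keys(num_keys);
  thrust::sequence(keys.begin(), keys.end());

  // Capacity and empty-key sentinel are illustrative.
  cuco::static_set<int> set{2 * num_keys, cuco::empty_key{-1}};
  set.insert(keys.begin(), keys.end());

  // Query a mix of present and absent keys; absent keys receive the sentinel (-1).
  thrust::device_vector<int> queries(num_keys);
  thrust::sequence(queries.begin(), queries.end(), static_cast<int>(num_keys / 2));
  thrust::device_vector<int> matches(num_keys);
  set.find(queries.begin(), queries.end(), matches.begin());
}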
6 changes: 2 additions & 4 deletions include/cuco/static_map.cuh
@@ -569,8 +569,7 @@ class static_map {
*
* @note This function synchronizes the given stream. For asynchronous execution use `find_async`.
* @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of
* `element` to
* `(output_begin + i)`. Else, copies the empty value sentinel.
* `element` to `(output_begin + i)`. Else, copies the empty value sentinel.
*
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type`
Expand All @@ -588,8 +587,7 @@ class static_map {
* equivalent to the query key.
*
* @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of
* `element` to
* `(output_begin + i)`. Else, copies the empty value sentinel.
* `element` to `(output_begin + i)`. Else, copies the empty value sentinel.
*
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type`
