Move host-bulk find details to OA base class
PointKernel committed May 15, 2024
1 parent fb3d490 commit 06c0688
Showing 7 changed files with 117 additions and 155 deletions.
83 changes: 78 additions & 5 deletions include/cuco/detail/open_addressing/kernels.cuh
@@ -18,15 +18,14 @@
#include <cuco/detail/utility/cuda.cuh>

#include <cub/block/block_reduce.cuh>

#include <cuda/atomic>
#include <cuda/functional>

#include <cooperative_groups.h>

#include <iterator>

namespace cuco {
namespace detail {
namespace cuco::detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
@@ -255,6 +254,81 @@ CUCO_KERNEL void contains_if_n(InputIt first,
}
}

/**
* @brief Finds the equivalent container elements of all keys in the range `[first, first + n)`.
*
* @note If the key `*(first + i)` has a match in the container, copies the match to `(output_begin
* + i)`. Else, copies the empty sentinel. Uses the CUDA Cooperative Groups API to leverage groups
* of multiple threads to find each key. This provides a significant boost in throughput compared to
* the non Cooperative Group `find` at moderate to high load factors.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize The size of the thread block
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param n Number of keys to query
* @param output_begin Beginning of the sequence of matched payloads retrieved for each key
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

using output_type = typename std::iterator_traits<OutputIt>::value_type;
__shared__ output_type output_buffer[BlockSize / CGSize];

auto constexpr has_payload = not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;

auto const sentinel = [&]() {
if constexpr (has_payload) {
return ref.empty_value_sentinel();
} else {
return ref.empty_key_sentinel();
}
}();

auto output = cuda::proclaim_return_type<output_type>([&] __device__(auto found) {
if constexpr (has_payload) {
return found == ref.end() ? sentinel : found->second;
} else {
return found == ref.end() ? sentinel : *found;
}
});

while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
if constexpr (CGSize == 1) {
auto const found = ref.find(key);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
* synchronizing before writing back to global, we no longer rely on L1, preventing the
* increase in sector stores from L2 to global and improving performance.
*/
output_buffer[thread_idx] = output(found);
block.sync();
*(output_begin + idx) = output_buffer[thread_idx];
} else {
auto const tile = cg::tiled_partition<CGSize>(block);
auto const found = ref.find(tile, key);

if (tile.thread_rank() == 0) { *(output_begin + idx) = output(found); }
}
}
idx += loop_stride;
}
}

/**
* @brief Calculates the number of filled slots for the given window storage.
*
@@ -340,5 +414,4 @@ CUCO_KERNEL void rehash(typename ContainerRef::storage_ref_type storage_ref,
}
}

} // namespace detail
} // namespace cuco
} // namespace cuco::detail
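
The unified kernel above serves both static_set and static_map by detecting at compile time whether the container carries a payload (`has_payload`). The following is a minimal illustration of that dispatch, not part of the commit; `Ref` stands in for any cuco device ref exposing `key_type`, `value_type`, and the sentinel getters.

#include <type_traits>

// Sketch only: selects the sentinel written for unmatched queries.
// A set stores bare keys (key_type == value_type); a map stores pairs,
// so unmatched map queries receive the empty *value* sentinel instead.
template <typename Ref>
constexpr auto empty_sentinel_for(Ref const& ref)
{
  constexpr bool has_payload =
    not std::is_same_v<typename Ref::key_type, typename Ref::value_type>;
  if constexpr (has_payload) {
    return ref.empty_value_sentinel();
  } else {
    return ref.empty_key_sentinel();
  }
}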
31 changes: 31 additions & 0 deletions include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -498,6 +498,37 @@ class open_addressing_impl {
first, num_keys, stencil, pred, output_begin, container_ref);
}

/**
* @brief For all keys in the range `[first, last)`, asynchronously finds
* a match with its key equivalent to the query key.
*
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator
* @tparam Ref Type of non-owning device container ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param last End of the sequence of keys
* @param output_begin Beginning of the sequence of matches retrieved for each key
* @param container_ref Non-owning device container ref used to access the slot storage
* @param stream Stream used for executing the kernels
*/
template <typename InputIt, typename OutputIt, typename Ref>
void find_async(InputIt first,
InputIt last,
OutputIt output_begin,
Ref container_ref,
cuda_stream_ref stream) const noexcept
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

detail::find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, container_ref);
}

/**
* @brief Retrieves all keys contained in the container.
*
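
For context on the launch configuration used by `find_async` above: each query key is processed by a cooperative group of `cg_size` threads, so the grid must supply `cg_size * num_keys` threads in total. Below is a plausible sketch of that computation, assuming the library's default 128-thread block; the actual `cuco::detail::grid_size` helper may differ in detail.

#include <cstdint>

// Hypothetical stand-in for cuco::detail::grid_size: one cooperative group of
// cg_size threads per key, rounded up to whole thread blocks.
constexpr std::int64_t bulk_find_grid_size(std::int64_t num_keys,
                                           std::int32_t cg_size,
                                           std::int32_t block_size = 128)
{
  auto const num_threads = num_keys * cg_size;
  return (num_threads + block_size - 1) / block_size;
}

// 1'000'000 keys, cg_size == 2, 128-thread blocks -> 15'625 blocks.
static_assert(bulk_find_grid_size(1'000'000, 2) == 15'625);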
68 changes: 2 additions & 66 deletions include/cuco/detail/static_map/kernels.cuh
@@ -25,9 +25,7 @@

#include <cooperative_groups.h>

namespace cuco {
namespace static_map_ns {
namespace detail {
namespace cuco::static_map_ns::detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
@@ -67,66 +65,4 @@ CUCO_KERNEL void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref
}
}

/**
* @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`.
*
* @note If the key `*(first + i)` has a match in the container, copies the payload of its matched
* element to `(output_begin + i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative
* Groups API to leverage groups of multiple threads to find each key. This provides a significant
* boost in throughput compared to the non Cooperative Group `find` at moderate to high load
* factors.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize The size of the thread block
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type`
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param n Number of keys to query
* @param output_begin Beginning of the sequence of matched payloads retrieved for each key
* @param ref Non-owning map device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

__shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize];

while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
if constexpr (CGSize == 1) {
auto const found = ref.find(key);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
* synchronizing before writing back to global, we no longer rely on L1, preventing the
* increase in sector stores from L2 to global and improving performance.
*/
output_buffer[thread_idx] =
found == ref.end() ? ref.empty_value_sentinel() : (*found).second;
block.sync();
*(output_begin + idx) = output_buffer[thread_idx];
} else {
auto const tile = cg::tiled_partition<CGSize>(block);
auto const found = ref.find(tile, key);

if (tile.thread_rank() == 0) {
*(output_begin + idx) = found == ref.end() ? ref.empty_value_sentinel() : (*found).second;
}
}
}
idx += loop_stride;
}
}

} // namespace detail
} // namespace static_map_ns
} // namespace cuco
} // namespace cuco::static_map_ns::detail
9 changes: 1 addition & 8 deletions include/cuco/detail/static_map/static_map.inl
@@ -374,14 +374,7 @@ template <typename InputIt, typename OutputIt>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

static_map_ns::detail::find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, ref(op::find));
impl_->find_async(first, last, output_begin, ref(op::find), stream);
}

template <class Key,
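
For reference, the host-bulk path that `static_map::find_async` now delegates to is exercised as follows; a minimal sketch with illustrative capacity, sentinel values, and variable names (not taken from this commit).

#include <cuco/static_map.cuh>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>

#include <cuda_runtime.h>

int main()
{
  std::size_t constexpr num_keys = 10'000;

  // Build {i, 2 * i} pairs on the host and copy them to the device.
  thrust::host_vector<cuco::pair<int, int>> h_pairs(num_keys);
  for (std::size_t i = 0; i < num_keys; ++i) {
    h_pairs[i] = cuco::pair<int, int>{static_cast<int>(i), 2 * static_cast<int>(i)};
  }
  thrust::device_vector<cuco::pair<int, int>> pairs = h_pairs;

  // Capacity and sentinels are illustrative; -1 marks empty slots and unmatched queries.
  cuco::static_map<int, int> map{2 * num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}};
  map.insert(pairs.begin(), pairs.end());

  thrust::device_vector<int> queries(num_keys);
  thrust::sequence(queries.begin(), queries.end());
  thrust::device_vector<int> payloads(num_keys);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Asynchronous host-bulk find: one payload (or -1) written per query key.
  map.find_async(queries.begin(), queries.end(), payloads.begin(), stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}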
66 changes: 2 additions & 64 deletions include/cuco/detail/static_set/kernels.cuh
@@ -20,75 +20,15 @@

#include <cub/block/block_reduce.cuh>
#include <cub/block/block_scan.cuh>

#include <cuda/atomic>

#include <cooperative_groups.h>

#include <iterator>

namespace cuco {
namespace static_set_ns {
namespace detail {
namespace cuco::static_set_ns::detail {

CUCO_SUPPRESS_KERNEL_WARNINGS
/**
* @brief Finds the equivalent set elements of all keys in the range `[first, last)`.
*
* If the key `*(first + i)` has a match in the set, copies its matched element to `(output_begin +
* i)`. Else, copies the empty key sentinel. Uses the CUDA Cooperative Groups API to leverage groups
* of multiple threads to find each key. This provides a significant boost in throughput compared to
* the non Cooperative Group `find` at moderate to high load factors.
*
* @tparam CGSize Number of threads in each CG
* @tparam BlockSize The size of the thread block
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the set's `key_type`
* @tparam Ref Type of non-owning device ref allowing access to storage
*
* @param first Beginning of the sequence of keys
* @param n Number of keys to query
* @param output_begin Beginning of the sequence of matched elements retrieved for each key
* @param ref Non-owning set device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;

auto const block = cg::this_thread_block();
auto const thread_idx = block.thread_rank();
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;

__shared__ typename Ref::key_type output_buffer[BlockSize / CGSize];

while (idx - thread_idx < n) { // the whole thread block falls into the same iteration
if (idx < n) {
typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
if constexpr (CGSize == 1) {
auto const found = ref.find(key);
/*
* The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
* sector stores from L2 to global memory. By writing results to shared memory and then
* synchronizing before writing back to global, we no longer rely on L1, preventing the
* increase in sector stores from L2 to global and improving performance.
*/
output_buffer[thread_idx] = found == ref.end() ? ref.empty_key_sentinel() : *found;
block.sync();
*(output_begin + idx) = output_buffer[thread_idx];
} else {
auto const tile = cg::tiled_partition<CGSize>(block);
auto const found = ref.find(tile, key);

if (tile.thread_rank() == 0) {
*(output_begin + idx) = found == ref.end() ? ref.empty_key_sentinel() : *found;
}
}
}
idx += loop_stride;
}
}

template <typename CG,
typename Size,
@@ -342,6 +282,4 @@ CUCO_KERNEL void retrieve(InputIt first,
}
}

} // namespace detail
} // namespace static_set_ns
} // namespace cuco
} // namespace cuco::static_set_ns::detail
9 changes: 1 addition & 8 deletions include/cuco/detail/static_set/static_set.inl
@@ -297,14 +297,7 @@ template <typename InputIt, typename OutputIt>
void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
{
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

static_set_ns::detail::find<cg_size, cuco::detail::default_block_size()>
<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, ref(op::find));
impl_->find_async(first, last, output_begin, ref(op::find), stream);
}

template <class Key,
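
Likewise for static_set, which now routes through the same base-class implementation. A brief host-side sketch with illustrative capacity and sentinel; the synchronous `find` forwards to `find_async` and then synchronizes the stream.

#include <cuco/static_set.cuh>

#include <thrust/device_vector.h>
#include <thrust/sequence.h>

int main()
{
  std::size_t constexpr num_keys = 10'000;

  thrust::device_vector<int> keys(num_keys);
  thrust::sequence(keys.begin(), keys.end());

  // Capacity and empty-key sentinel are illustrative.
  cuco::static_set<int> set{2 * num_keys, cuco::empty_key{-1}};
  set.insert(keys.begin(), keys.end());

  // Query a mix of present and absent keys; absent keys receive the sentinel (-1).
  thrust::device_vector<int> queries(num_keys);
  thrust::sequence(queries.begin(), queries.end(), static_cast<int>(num_keys / 2));
  thrust::device_vector<int> matches(num_keys);
  set.find(queries.begin(), queries.end(), matches.begin());
}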
6 changes: 2 additions & 4 deletions include/cuco/static_map.cuh
@@ -569,8 +569,7 @@ class static_map {
*
* @note This function synchronizes the given stream. For asynchronous execution use `find_async`.
* @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of
* `element` to
* `(output_begin + i)`. Else, copies the empty value sentinel.
* `element` to `(output_begin + i)`. Else, copies the empty value sentinel.
*
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type`
Expand All @@ -588,8 +587,7 @@ class static_map {
* equivalent to the query key.
*
* @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of
* `element` to
* `(output_begin + i)`. Else, copies the empty value sentinel.
* `element` to `(output_begin + i)`. Else, copies the empty value sentinel.
*
* @tparam InputIt Device accessible input iterator
* @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type`
