diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 4e9570bce..b427feff4 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,280 @@ #include namespace cuco { +namespace experimental { +template +constexpr static_multimap:: + static_multimap(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + Storage, + Allocator const& alloc, + cuda::stream_ref stream) + : impl_{std::make_unique(capacity, + cuco::pair{empty_key_sentinel, empty_value_sentinel}, + pred, + probing_scheme, + alloc, + stream)}, + empty_value_sentinel_{empty_value_sentinel} +{ +} + +template +constexpr static_multimap:: + static_multimap(Extent n, + double desired_load_factor, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + Storage, + Allocator const& alloc, + cuda::stream_ref stream) + : impl_{std::make_unique(n, + desired_load_factor, + cuco::pair{empty_key_sentinel, empty_value_sentinel}, + pred, + probing_scheme, + alloc, + stream)}, + empty_value_sentinel_{empty_value_sentinel} +{ +} + +template +constexpr static_multimap:: + static_multimap(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + Storage, + Allocator const& alloc, + cuda::stream_ref stream) + : impl_{std::make_unique(capacity, + cuco::pair{empty_key_sentinel, empty_value_sentinel}, + erased_key_sentinel, + pred, + probing_scheme, + alloc, + stream)}, + empty_value_sentinel_{empty_value_sentinel} +{ +} + +template +void static_multimap::clear( + cuda::stream_ref stream) +{ + impl_->clear(stream); +} + +template +void static_multimap:: + clear_async(cuda::stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + +template +template +static_multimap::size_type +static_multimap::insert( + InputIt first, InputIt last, cuda::stream_ref stream) +{ + return impl_->insert(first, last, ref(op::insert), stream); +} + +template +template +void static_multimap:: + insert_async(InputIt first, InputIt last, cuda::stream_ref stream) noexcept +{ + impl_->insert_async(first, last, ref(op::insert), stream); +} + +template +template +void static_multimap::contains( + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const +{ + this->contains_async(first, last, output_begin, stream); + stream.wait(); +} + +template +template +void static_multimap:: + contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream) const noexcept +{ + impl_->contains_async(first, last, output_begin, ref(op::contains), stream); +} + +template +constexpr auto +static_multimap::capacity() + const noexcept +{ + return impl_->capacity(); +} + +template +constexpr static_multimap:: + key_type + static_multimap:: + empty_key_sentinel() const noexcept +{ + return impl_->empty_key_sentinel(); +} + +template +constexpr static_multimap:: + mapped_type + static_multimap:: + empty_value_sentinel() const noexcept +{ + return empty_value_sentinel_; +} + +template +constexpr static_multimap:: + key_type + static_multimap:: + erased_key_sentinel() const noexcept +{ + return impl_->erased_key_sentinel(); +} + +template +template +auto static_multimap::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return cuco::detail::bitwise_compare(this->empty_key_sentinel(), this->erased_key_sentinel()) + ? ref_type{cuco::empty_key(this->empty_key_sentinel()), + cuco::empty_value(this->empty_value_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + cuda_thread_scope{}, + impl_->storage_ref()} + : ref_type{cuco::empty_key(this->empty_key_sentinel()), + cuco::empty_value(this->empty_value_sentinel()), + cuco::erased_key(this->erased_key_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + cuda_thread_scope{}, + impl_->storage_ref()}; +} +} // namespace experimental template +#include + +#include +#include + +#include + +namespace cuco { + +template +__host__ __device__ constexpr static_multimap_ref< + Key, + T, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_multimap_ref(cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + StorageRef storage_ref) noexcept + : impl_{ + cuco::pair{empty_key_sentinel, empty_value_sentinel}, predicate, probing_scheme, storage_ref} +{ +} + +template +__host__ __device__ constexpr static_multimap_ref< + Key, + T, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_multimap_ref(cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + cuco::erased_key erased_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope, + StorageRef storage_ref) noexcept + : impl_{cuco::pair{empty_key_sentinel, empty_value_sentinel}, + erased_key_sentinel, + predicate, + probing_scheme, + storage_ref} +{ +} + +template +template +__host__ __device__ constexpr static_multimap_ref:: + static_multimap_ref( + static_multimap_ref&& + other) noexcept + : impl_{std::move(other.impl_)} +{ +} + +template +__host__ __device__ constexpr static_multimap_ref::key_equal +static_multimap_ref::key_eq() + const noexcept +{ + return impl_.key_eq(); +} + +template +__host__ __device__ constexpr static_multimap_ref::const_iterator +static_multimap_ref::end() + const noexcept +{ + return impl_.end(); +} + +template +__host__ __device__ constexpr static_multimap_ref::iterator +static_multimap_ref:: + end() noexcept +{ + return this->impl_.end(); +} + +template +__host__ __device__ constexpr auto +static_multimap_ref::capacity() + const noexcept +{ + return impl_.capacity(); +} + +template +__host__ __device__ constexpr static_multimap_ref::extent_type +static_multimap_ref:: + window_extent() const noexcept +{ + return impl_.window_extent(); +} + +template +__host__ __device__ constexpr Key +static_multimap_ref:: + empty_key_sentinel() const noexcept +{ + return impl_.empty_key_sentinel(); +} + +template +__host__ __device__ constexpr T +static_multimap_ref:: + empty_value_sentinel() const noexcept +{ + return impl_.empty_value_sentinel(); +} + +template +__host__ __device__ constexpr Key +static_multimap_ref:: + erased_key_sentinel() const noexcept +{ + return impl_.erased_key_sentinel(); +} + +template +template +auto static_multimap_ref::with( + NewOperators...) && noexcept +{ + return static_multimap_ref{ + std::move(*this)}; +} + +template +template +__host__ __device__ auto constexpr static_multimap_ref< + Key, + T, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::with_operators(NewOperators...) const noexcept +{ + return static_multimap_ref{ + cuco::empty_key{this->empty_key_sentinel()}, + cuco::empty_value{this->empty_value_sentinel()}, + this->key_eq(), + impl_.probing_scheme(), + {}, + impl_.storage_ref()}; +} + +template +template +__device__ constexpr auto +static_multimap_ref::make_copy( + CG const& tile, + window_type* const memory_to_use, + cuda_thread_scope scope) const noexcept +{ + impl_.make_copy(tile, memory_to_use); + return static_multimap_ref{ + cuco::empty_key{this->empty_key_sentinel()}, + cuco::empty_value{this->empty_value_sentinel()}, + cuco::erased_key{this->erased_key_sentinel()}, + this->key_eq(), + impl_.probing_scheme(), + scope, + storage_ref_type{this->window_extent(), memory_to_use}}; +} + +template +template +__device__ constexpr void +static_multimap_ref::initialize( + CG const& tile) noexcept +{ + impl_.initialize(tile); +} + +namespace detail { + +template +class operator_impl< + op::insert_tag, + static_multimap_ref> { + using base_type = static_multimap_ref; + using ref_type = + static_multimap_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using mapped_type = T; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Inserts an element. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(Value const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + return ref_.impl_.insert(value); + } + + /** + * @brief Inserts an element. + * + * @tparam Value Input type which is convertible to 'value_type' + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + Value const& value) noexcept + { + auto& ref_ = static_cast(*this); + return ref_.impl_.insert(group, value); + } +}; + +template +class operator_impl< + op::contains_tag, + static_multimap_ref> { + using base_type = static_multimap_ref; + using ref_type = + static_multimap_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Input key type which is convertible to 'key_type' + * + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(key); + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Input key type which is convertible to 'key_type' + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(group, key); + } +}; +} // namespace detail +} // namespace cuco diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index c75283b54..20c1a1ae2 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,14 +17,17 @@ #pragma once #include +#include #include #include #include +#include #include #include #include #include +#include #include #if defined(CUCO_HAS_CUDA_BARRIER) @@ -39,6 +42,364 @@ #include namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of key-value pairs that supports + * equivalent keys. + * + * The `static_multimap` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the map. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. + * @note cuCollections data structures always place the slot keys on the right-hand side when + * invoking the key comparison predicate, i.e., `pred(query_key, slot_key)`. Order-sensitive + * `KeyEqual` should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given payload type is larger than 8 bytes + * @throw If the size of the given slot type is larger than 16 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given mapped type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam T Type of the mapped values + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = cuco::linear_probing<4, // CG size + cuco::default_hash_function>, + class Allocator = cuco::cuda_allocator>, + class Storage = cuco::storage<1>> +class static_multimap { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + + static_assert(cuco::is_bitwise_comparable_v, + "Mapped type must have unique object representations or have been explicitly " + "declared as safe for bitwise comparison via specialization of " + "cuco::is_bitwise_comparable_v."); + + using impl_type = cuco::detail::open_addressing_impl, + Extent, + Scope, + KeyEqual, + ProbingScheme, + Allocator, + Storage>; + + public: + static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key-value pair type + using extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + + using mapped_type = T; ///< Payload type + template + using ref_type = cuco::static_multimap_ref; ///< Non-owning container ref type + + static_multimap(static_multimap const&) = delete; + static_multimap& operator=(static_multimap const&) = delete; + + static_multimap(static_multimap&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current map object + */ + static_multimap& operator=(static_multimap&&) = default; + ~static_multimap() = default; + + /** + * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values + * and CUDA stream + * + * The actual map capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it is computed via the `make_window_extent` factory. Insert operations will not + * automatically grow the map. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. + * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note This constructor doesn't synchronize the given stream. + * + * @param capacity The requested lower-bound map size + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage Kind of storage to use + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_multimap(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + cuda_thread_scope scope = {}, + Storage storage = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); + + /** + * @brief Constructs a statically-sized map with the number of elements to insert `n`, the desired + * load factor, etc + * + * @note This constructor helps users create a map based on the number of elements to insert and + * the desired load factor without manually computing the desired capacity. The actual map + * capacity will be a size no smaller than `ceil(n / desired_load_factor)`. It's determined by + * multiple factors including the given `n`, the desired load factor, the probing scheme, the CG + * size, and the window size and is computed via the `make_window_extent` factory. + * @note Insert operations will not automatically grow the container. + * @note Attempting to insert more unique keys than the capacity of the container results in + * undefined behavior. + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note This constructor doesn't synchronize the given stream. + * @note This overload will convert compile-time extents to runtime constants which might lead to + * performance regressions. + * + * @throw If the desired occupancy is no bigger than zero + * @throw If the desired occupancy is no smaller than one + * + * @param n The number of elements to insert + * @param desired_load_factor The desired load factor of the container, e.g., 0.5 implies a 50% + * load factor + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage Kind of storage to use + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_multimap(Extent n, + double desired_load_factor, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + cuda_thread_scope scope = {}, + Storage storage = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); + + /** + * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual map capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it is computed via the `make_window_extent` factory. Insert operations will not + * automatically grow the map. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. + * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the + * stream before the object is first used. + * + * @param capacity The requested lower-bound map size + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param erased_key_sentinel The reserved key to denote erased slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage Kind of storage to use + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_multimap(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + cuda_thread_scope scope = {}, + Storage storage = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); + + /** + * @brief Erases all elements from the container. After this call, `size()` returns zero. + * Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear(cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns + * zero. Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear_async(cuda::stream_ref stream = {}) noexcept; + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_multimap::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successful insertions + */ + template + size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); + + /** + * @brief Asynchronously inserts all keys in the range `[first, last)`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_multimap::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the map. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream = {}) const noexcept; + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + [[nodiscard]] constexpr mapped_type empty_value_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an erased key slot. + * + * @return The sentinel value used to represent an erased key slot + */ + [[nodiscard]] constexpr key_type erased_key_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_multimap` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + std::unique_ptr impl_; ///< Static map implementation + mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload +}; +} // namespace experimental /** * @brief A GPU-accelerated, unordered, associative container of key-value pairs that supports diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh new file mode 100644 index 000000000..fceb10489 --- /dev/null +++ b/include/cuco/static_multimap_ref.cuh @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. + * @note cuCollections data structures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given payload type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given payload type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam T Type used for mapped values. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_multimap_ref + : public detail::operator_impl< + Operators, + static_multimap_ref>... { + /// Flag indicating whether duplicate keys are allowed or not + static constexpr auto allows_duplicates = true; + + /// Implementation type + using impl_type = detail:: + open_addressing_ref_impl; + + static_assert(sizeof(T) == 4 or sizeof(T) == 8, + "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + public: + using key_type = Key; ///< Key type + using mapped_type = T; ///< Mapped type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + /** + * @brief Constructs static_multimap_ref. + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param empty_value_sentinel Sentinel indicating empty payload + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_multimap_ref( + cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope scope, + StorageRef storage_ref) noexcept; + + /** + * @brief Constructs static_multimap_ref. + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param empty_value_sentinel Sentinel indicating empty payload + * @param erased_key_sentinel Sentinel indicating erased key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param scope The scope in which operations will be performed + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_multimap_ref( + cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + cuco::erased_key erased_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + cuda_thread_scope scope, + StorageRef storage_ref) noexcept; + + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_multimap_ref( + static_multimap_ref&& + other) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the window extent of the current storage. + * + * @return The window extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr mapped_type empty_value_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an erased key slot. + * + * @return The sentinel value used to represent an erased key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type erased_key_sentinel() const noexcept; + + /** + * @brief Gets the key comparator. + * + * @return The comparator used to compare keys + */ + [[nodiscard]] __host__ __device__ constexpr key_equal key_eq() const noexcept; + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept; + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept; + + /** + * @brief Creates a reference with new operators from the current object. + * + * @deprecated This function is deprecated. Use the new `with_operators` instead. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + + /** + * @brief Creates a reference with new operators from the current object + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ constexpr auto with_operators( + NewOperators... ops) const noexcept; + + /** + * @brief Makes a copy of the current device reference using non-owned memory + * + * This function is intended to be used to create shared memory copies of small static maps, + * although global memory can be used as well. + * + * @note This function synchronizes the group `tile`. + * @note By-default the thread scope of the copy will be the same as the scope of the parent ref. + * + * @tparam CG The type of the cooperative thread group + * @tparam NewScope The thread scope of the newly created device ref + * + * @param tile The ooperative thread group used to copy the data structure + * @param memory_to_use Array large enough to support `capacity` elements. Object does not take + * the ownership of the memory + * @param scope The thread scope of the newly created device ref + * + * @return Copy of the current device ref + */ + template + [[nodiscard]] __device__ constexpr auto make_copy( + CG const& tile, + window_type* const memory_to_use, + cuda_thread_scope scope = {}) const noexcept; + + /** + * @brief Initializes the map storage using the threads in the group `tile`. + * + * @note This function synchronizes the group `tile`. + * + * @tparam CG The type of the cooperative thread group + * + * @param tile The cooperative thread group used to initialize the map + */ + template + __device__ constexpr void initialize(CG const& tile) noexcept; + + private: + impl_type impl_; ///< Static map ref implementation + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_multimap_ref; +}; + +} // namespace cuco + +#include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a42f34a5d..939d3ce65 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -111,6 +111,7 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/custom_pair_retrieve_test.cu static_multimap/custom_type_test.cu static_multimap/heterogeneous_lookup_test.cu + static_multimap/insert_test.cu static_multimap/insert_if_test.cu static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu diff --git a/tests/static_multimap/insert_test.cu b/tests/static_multimap/insert_test.cu new file mode 100644 index 000000000..6b8535a9d --- /dev/null +++ b/tests/static_multimap/insert_test.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +template +void test_insert(Map& map, std::size_t num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto keys_begin = d_keys.begin(); + auto pairs_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type>( + [] __device__(auto i) { return cuco::pair{i, i}; })); + thrust::device_vector d_contained(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + map.insert(pairs_begin, pairs_begin + num_keys); + + SECTION("All inserted keys should be contained.") + { + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "static_multimap insert test", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{4'000}; + + using extent_type = cuco::extent; + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_multimap, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + extent_type{num_keys}, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert(map, num_keys); +}