From 718b078f219d409016b21bc0318e19c233f45653 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 7 Jan 2025 16:26:31 -0500 Subject: [PATCH 1/8] Separate implementation from interface --- cpp/CMakeLists.txt | 6 +- cpp/include/kvikio/cufile/config.hpp | 23 +-- cpp/include/kvikio/cufile/driver.hpp | 232 +++---------------------- cpp/include/kvikio/shim/cuda.hpp | 50 +----- cpp/include/kvikio/shim/cufile.hpp | 127 ++------------ cpp/include/kvikio/shim/libcurl.hpp | 106 ++---------- cpp/include/kvikio/shim/utils.hpp | 48 +---- cpp/src/cufile/config.cpp | 42 +++++ cpp/src/cufile/driver.cpp | 250 +++++++++++++++++++++++++++ cpp/src/shim/cuda.cpp | 72 ++++++++ cpp/src/shim/cufile.cpp | 157 +++++++++++++++++ cpp/src/shim/libcurl.cpp | 134 ++++++++++++++ cpp/src/shim/utils.cpp | 69 ++++++++ 13 files changed, 786 insertions(+), 530 deletions(-) create mode 100644 cpp/src/cufile/config.cpp create mode 100644 cpp/src/cufile/driver.cpp create mode 100644 cpp/src/shim/cuda.cpp create mode 100644 cpp/src/shim/cufile.cpp create mode 100644 cpp/src/shim/libcurl.cpp create mode 100644 cpp/src/shim/utils.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a0639c5382..ea7d29a06f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -131,7 +131,9 @@ include(cmake/thirdparty/get_thread_pool.cmake) # ################################################################################################## # * library targets -------------------------------------------------------------------------------- -set(SOURCES "src/file_handle.cpp") +set(SOURCES "src/file_handle.cpp" "src/cufile/config.cpp" "src/cufile/driver.cpp" + "src/shim/cuda.cpp" "src/shim/cufile.cpp" "src/shim/libcurl.cpp" "src/shim/utils.cpp" +) if(KvikIO_REMOTE_SUPPORT) list(APPEND SOURCES "src/remote_handle.cpp") diff --git a/cpp/include/kvikio/cufile/config.hpp b/cpp/include/kvikio/cufile/config.hpp index b1457880bb..7dd9ee7bcb 100644 --- a/cpp/include/kvikio/cufile/config.hpp +++ b/cpp/include/kvikio/cufile/config.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,24 +15,9 @@ */ #pragma once -#include -#include -#include - #include namespace kvikio { -namespace detail { - -[[nodiscard]] inline const char* lookup_config_path() -{ - const char* env = std::getenv("CUFILE_ENV_PATH_JSON"); - if (env != nullptr && std::filesystem::exists(env)) { return env; } - if (std::filesystem::exists("/etc/cufile.json")) { return "/etc/cufile.json"; } - return ""; -} - -} // namespace detail /** * @brief Get the filepath to cuFile's config file (`cufile.json`) or the empty string @@ -41,10 +26,6 @@ namespace detail { * * @return The filepath to the cufile.json file or the empty string if it isn't found. */ -[[nodiscard]] KVIKIO_EXPORT inline const std::string& config_path() -{ - static const std::string ret = detail::lookup_config_path(); - return ret; -} +[[nodiscard]] KVIKIO_EXPORT const std::string& config_path(); } // namespace kvikio diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp index b609029a69..269761c75d 100644 --- a/cpp/include/kvikio/cufile/driver.hpp +++ b/cpp/include/kvikio/cufile/driver.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,53 +15,24 @@ */ #pragma once -#include #include -#include #include #include namespace kvikio { -namespace detail { - -[[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept -{ - return (prop & (1U << flag)) != 0; -} - -inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept -{ - if (val) { - prop |= (1U << flag); - } else { - prop &= ~(1U << flag); - } -} -} // namespace detail - -#ifdef KVIKIO_CUFILE_FOUND class DriverInitializer { // Optional, if not used cuFiles opens the driver automatically public: - DriverInitializer() { cuFileAPI::instance().driver_open(); } + DriverInitializer(); DriverInitializer(DriverInitializer const&) = delete; DriverInitializer& operator=(DriverInitializer const&) = delete; DriverInitializer(DriverInitializer&&) noexcept = delete; DriverInitializer& operator=(DriverInitializer&&) noexcept = delete; - ~DriverInitializer() - { - try { - cuFileAPI::instance().driver_close(); - } catch (const CUfileException& e) { - std::cerr << "Unable to close GDS file driver: "; - std::cerr << e.what(); - std::cerr << std::endl; - } - } + ~DriverInitializer(); }; class DriverProperties { @@ -71,204 +42,45 @@ class DriverProperties { // Because Cython does not handle exceptions in the default // constructor, we initialize `_props` lazily. - void lazy_init() - { - if (_initialized) { return; } - _initialized = true; - CUFILE_TRY(cuFileAPI::instance().DriverGetProperties(&_props)); - } + void lazy_init(); public: +#ifdef KVIKIO_CUFILE_FOUND DriverProperties() = default; - - bool is_gds_available() - { - // If both the major and minor version is zero, the GDS driver isn't loaded. - return !(get_nvfs_major_version() == 0 && get_nvfs_minor_version() == 0); - } - - [[nodiscard]] unsigned int get_nvfs_major_version() - { - lazy_init(); - return _props.nvfs.major_version; - } - - [[nodiscard]] unsigned int get_nvfs_minor_version() - { - lazy_init(); - return _props.nvfs.minor_version; - } - - [[nodiscard]] bool get_nvfs_allow_compat_mode() - { - lazy_init(); - return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE); - } - - [[nodiscard]] bool get_nvfs_poll_mode() - { - lazy_init(); - return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE); - } - - [[nodiscard]] std::size_t get_nvfs_poll_thresh_size() - { - lazy_init(); - return _props.nvfs.poll_thresh_size; - } - - void set_nvfs_poll_mode(bool enable) - { - lazy_init(); - CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(enable, get_nvfs_poll_thresh_size())); - detail::set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable); - } - - void set_nvfs_poll_thresh_size(std::size_t size_in_kb) - { - lazy_init(); - CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(get_nvfs_poll_mode(), size_in_kb)); - _props.nvfs.poll_thresh_size = size_in_kb; - } - - [[nodiscard]] std::vector get_nvfs_statusflags() - { - lazy_init(); - std::vector ret; - if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) { - ret.push_back(CU_FILE_USE_POLL_MODE); - } - if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) { - ret.push_back(CU_FILE_ALLOW_COMPAT_MODE); - } - return ret; - } - - [[nodiscard]] std::size_t get_max_device_cache_size() - { - lazy_init(); - return _props.max_device_cache_size; - } - - void set_max_device_cache_size(std::size_t size_in_kb) - { - lazy_init(); - CUFILE_TRY(cuFileAPI::instance().DriverSetMaxCacheSize(size_in_kb)); - _props.max_device_cache_size = size_in_kb; - } - - [[nodiscard]] std::size_t get_per_buffer_cache_size() - { - lazy_init(); - return _props.per_buffer_cache_size; - } - - [[nodiscard]] std::size_t get_max_pinned_memory_size() - { - lazy_init(); - return _props.max_device_pinned_mem_size; - } - - void set_max_pinned_memory_size(std::size_t size_in_kb) - { - lazy_init(); - CUFILE_TRY(cuFileAPI::instance().DriverSetMaxPinnedMemSize(size_in_kb)); - _props.max_device_pinned_mem_size = size_in_kb; - } - - [[nodiscard]] std::size_t get_max_batch_io_size() - { -#ifdef KVIKIO_CUFILE_BATCH_API_FOUND - lazy_init(); - return _props.max_batch_io_size; -#else - return 0; -#endif - } -}; - #else -struct DriverInitializer { // Implement a non-default constructor to avoid `unused variable` warnings downstream - DriverInitializer() {} -}; - -struct DriverProperties { - // Implement a non-default constructor to avoid `unused variable` warnings downstream - DriverProperties() {} + DriverProperties(); +#endif - static bool is_gds_available() { return false; } + bool is_gds_available(); - [[nodiscard]] static unsigned int get_nvfs_major_version() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] unsigned int get_nvfs_major_version(); - [[nodiscard]] static unsigned int get_nvfs_minor_version() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] unsigned int get_nvfs_minor_version(); - [[nodiscard]] static bool get_nvfs_allow_compat_mode() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] bool get_nvfs_allow_compat_mode(); - [[nodiscard]] static bool get_nvfs_poll_mode() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] bool get_nvfs_poll_mode(); - [[nodiscard]] static std::size_t get_nvfs_poll_thresh_size() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] std::size_t get_nvfs_poll_thresh_size(); - static void set_nvfs_poll_mode(bool enable) - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + void set_nvfs_poll_mode(bool enable); - static void set_nvfs_poll_thresh_size(std::size_t size_in_kb) - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + void set_nvfs_poll_thresh_size(std::size_t size_in_kb); - [[nodiscard]] static std::vector get_nvfs_statusflags() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] std::vector get_nvfs_statusflags(); - [[nodiscard]] static std::size_t get_max_device_cache_size() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] std::size_t get_max_device_cache_size(); - static void set_max_device_cache_size(std::size_t size_in_kb) - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + void set_max_device_cache_size(std::size_t size_in_kb); - [[nodiscard]] static std::size_t get_per_buffer_cache_size() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] std::size_t get_per_buffer_cache_size(); - [[nodiscard]] static std::size_t get_max_pinned_memory_size() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] std::size_t get_max_pinned_memory_size(); - static void set_max_pinned_memory_size(std::size_t size_in_kb) - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + void set_max_pinned_memory_size(std::size_t size_in_kb); - [[nodiscard]] std::size_t get_max_batch_io_size() - { - throw CUfileException("KvikIO not compiled with cuFile.h"); - } + [[nodiscard]] std::size_t get_max_batch_io_size(); }; -#endif } // namespace kvikio diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp index 606a618736..f868d40f58 100644 --- a/cpp/include/kvikio/shim/cuda.hpp +++ b/cpp/include/kvikio/shim/cuda.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -50,46 +51,13 @@ class cudaAPI { decltype(cuStreamDestroy)* StreamDestroy{nullptr}; private: -#ifdef KVIKIO_CUDA_FOUND - cudaAPI() - { - void* lib = load_library("libcuda.so.1"); - // Notice, the API version loaded must match the version used downstream. That is, - // if a project uses the `_v2` CUDA Driver API or the newest Runtime API, the symbols - // loaded should also be the `_v2` symbols. Thus, we use KVIKIO_STRINGIFY() to get - // the name of the symbol through cude.h. - get_symbol(MemHostAlloc, lib, KVIKIO_STRINGIFY(cuMemHostAlloc)); - get_symbol(MemFreeHost, lib, KVIKIO_STRINGIFY(cuMemFreeHost)); - get_symbol(MemcpyHtoDAsync, lib, KVIKIO_STRINGIFY(cuMemcpyHtoDAsync)); - get_symbol(MemcpyDtoHAsync, lib, KVIKIO_STRINGIFY(cuMemcpyDtoHAsync)); - get_symbol(PointerGetAttribute, lib, KVIKIO_STRINGIFY(cuPointerGetAttribute)); - get_symbol(PointerGetAttributes, lib, KVIKIO_STRINGIFY(cuPointerGetAttributes)); - get_symbol(CtxPushCurrent, lib, KVIKIO_STRINGIFY(cuCtxPushCurrent)); - get_symbol(CtxPopCurrent, lib, KVIKIO_STRINGIFY(cuCtxPopCurrent)); - get_symbol(CtxGetCurrent, lib, KVIKIO_STRINGIFY(cuCtxGetCurrent)); - get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange)); - get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName)); - get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString)); - get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet)); - get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain)); - get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease)); - get_symbol(StreamSynchronize, lib, KVIKIO_STRINGIFY(cuStreamSynchronize)); - get_symbol(StreamCreate, lib, KVIKIO_STRINGIFY(cuStreamCreate)); - get_symbol(StreamDestroy, lib, KVIKIO_STRINGIFY(cuStreamDestroy)); - } -#else - cudaAPI() { throw std::runtime_error("KvikIO not compiled with CUDA support"); } -#endif + cudaAPI(); public: cudaAPI(cudaAPI const&) = delete; void operator=(cudaAPI const&) = delete; - KVIKIO_EXPORT static cudaAPI& instance() - { - static cudaAPI _instance; - return _instance; - } + KVIKIO_EXPORT static cudaAPI& instance(); }; /** @@ -100,15 +68,7 @@ class cudaAPI { * @return The boolean answer */ #ifdef KVIKIO_CUDA_FOUND -inline bool is_cuda_available() -{ - try { - cudaAPI::instance(); - } catch (const std::runtime_error&) { - return false; - } - return true; -} +bool is_cuda_available(); #else constexpr bool is_cuda_available() { return false; } #endif diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp index 5194d45e74..c90fba1fce 100644 --- a/cpp/include/kvikio/shim/cufile.hpp +++ b/cpp/include/kvikio/shim/cufile.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ */ #pragma once -#include -#include - #include #include @@ -64,79 +61,15 @@ class cuFileAPI { int version{0}; private: -#ifdef KVIKIO_CUFILE_FOUND - cuFileAPI() - { - // CUDA versions before CUDA 11.7.1 did not ship libcufile.so.0, so this is - // a workaround that adds support for all prior versions of libcufile. - void* lib = load_library({"libcufile.so.0", - "libcufile.so.1.3.0" /* 11.7.0 */, - "libcufile.so.1.2.1" /* 11.6.2, 11.6.1 */, - "libcufile.so.1.2.0" /* 11.6.0 */, - "libcufile.so.1.1.1" /* 11.5.1 */, - "libcufile.so.1.1.0" /* 11.5.0 */, - "libcufile.so.1.0.2" /* 11.4.4, 11.4.3, 11.4.2 */, - "libcufile.so.1.0.1" /* 11.4.1 */, - "libcufile.so.1.0.0" /* 11.4.0 */}); - get_symbol(HandleRegister, lib, KVIKIO_STRINGIFY(cuFileHandleRegister)); - get_symbol(HandleDeregister, lib, KVIKIO_STRINGIFY(cuFileHandleDeregister)); - get_symbol(Read, lib, KVIKIO_STRINGIFY(cuFileRead)); - get_symbol(Write, lib, KVIKIO_STRINGIFY(cuFileWrite)); - get_symbol(BufRegister, lib, KVIKIO_STRINGIFY(cuFileBufRegister)); - get_symbol(BufDeregister, lib, KVIKIO_STRINGIFY(cuFileBufDeregister)); - get_symbol(DriverOpen, lib, KVIKIO_STRINGIFY(cuFileDriverOpen)); - get_symbol(DriverClose, lib, KVIKIO_STRINGIFY(cuFileDriverClose)); - get_symbol(DriverGetProperties, lib, KVIKIO_STRINGIFY(cuFileDriverGetProperties)); - get_symbol(DriverSetPollMode, lib, KVIKIO_STRINGIFY(cuFileDriverSetPollMode)); - get_symbol(DriverSetMaxCacheSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxCacheSize)); - get_symbol(DriverSetMaxPinnedMemSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxPinnedMemSize)); - -#ifdef KVIKIO_CUFILE_VERSION_API_FOUND - try { - get_symbol(GetVersion, lib, KVIKIO_STRINGIFY(cuFileGetVersion)); - int ver; - CUfileError_t const error = GetVersion(&ver); - if (error.err == CU_FILE_SUCCESS) { version = ver; } - } catch (std::runtime_error const&) { - } -#endif - - // Some symbols were introduced in later versions, so version guards are required. - // Note: `version` is 0 for cuFile versions prior to v1.8 because `cuFileGetVersion` - // did not exist. As a result, the batch and stream APIs are not loaded in versions - // 1.6 and 1.7, respectively, even though they are available. This trade-off is made - // for improved robustness. - if (version >= 1060) { - get_symbol(BatchIOSetUp, lib, KVIKIO_STRINGIFY(cuFileBatchIOSetUp)); - get_symbol(BatchIOSubmit, lib, KVIKIO_STRINGIFY(cuFileBatchIOSubmit)); - get_symbol(BatchIOGetStatus, lib, KVIKIO_STRINGIFY(cuFileBatchIOGetStatus)); - get_symbol(BatchIOCancel, lib, KVIKIO_STRINGIFY(cuFileBatchIOCancel)); - get_symbol(BatchIODestroy, lib, KVIKIO_STRINGIFY(cuFileBatchIODestroy)); - } - if (version >= 1070) { - get_symbol(ReadAsync, lib, KVIKIO_STRINGIFY(cuFileReadAsync)); - get_symbol(WriteAsync, lib, KVIKIO_STRINGIFY(cuFileWriteAsync)); - get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister)); - get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister)); - } - - // cuFile is supposed to open and close the driver automatically but - // because of a bug in cuFile v1.4 (CUDA v11.8) it sometimes segfaults: - // . - if (version < 1050) { driver_open(); } - } + cuFileAPI(); +#ifdef KVIKIO_CUFILE_FOUND // Notice, we have to close the driver at program exit (if we opened it) even though we are // not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the // driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but // this seems to be the lesser of two evils. // [1] - ~cuFileAPI() - { - if (version < 1050) { driver_close(); } - } -#else - cuFileAPI() { throw std::runtime_error("KvikIO not compiled with cuFile.h"); } + ~cuFileAPI(); #endif public: @@ -145,11 +78,7 @@ class cuFileAPI { cuFileAPI(cuFileAPI const&&) = delete; void operator=(cuFileAPI const&&) = delete; - KVIKIO_EXPORT static cuFileAPI& instance() - { - static cuFileAPI _instance; - return _instance; - } + KVIKIO_EXPORT static cuFileAPI& instance(); /** * @brief Open the cuFile driver @@ -157,26 +86,12 @@ class cuFileAPI { * cuFile allows multiple calls to `cufileDriverOpen()`, only the first call opens * the driver, but every call should have a matching call to `cufileDriverClose()`. */ - void driver_open() - { - CUfileError_t const error = DriverOpen(); - if (error.err != CU_FILE_SUCCESS) { - throw std::runtime_error(std::string{"Unable to open GDS file driver: "} + - cufileop_status_error(error.err)); - } - } + void driver_open(); /** * @brief Close the cuFile driver */ - void driver_close() - { - CUfileError_t const error = DriverClose(); - if (error.err != CU_FILE_SUCCESS) { - throw std::runtime_error(std::string{"Unable to close GDS file driver: "} + - cufileop_status_error(error.err)); - } - } + void driver_close(); }; /** @@ -187,15 +102,7 @@ class cuFileAPI { * @return The boolean answer */ #ifdef KVIKIO_CUFILE_FOUND -inline bool is_cufile_library_available() -{ - try { - cuFileAPI::instance(); - } catch (const std::runtime_error&) { - return false; - } - return true; -} +bool is_cufile_library_available(); #else constexpr bool is_cufile_library_available() { return false; } #endif @@ -208,10 +115,7 @@ constexpr bool is_cufile_library_available() { return false; } * * @return The boolean answer */ -inline bool is_cufile_available() -{ - return is_cufile_library_available() && run_udev_readable() && !is_running_in_wsl(); -} +bool is_cufile_available(); /** * @brief Get cufile version (or zero if older than v1.8). @@ -225,14 +129,7 @@ inline bool is_cufile_available() * @return The version (1000*major + 10*minor) or zero if older than 1080. */ #ifdef KVIKIO_CUFILE_FOUND -inline int cufile_version() -{ - try { - return cuFileAPI::instance().version; - } catch (std::runtime_error const&) { - return 0; - } -} +int cufile_version(); #else constexpr int cufile_version() { return 0; } #endif @@ -246,7 +143,7 @@ constexpr int cufile_version() { return 0; } * * @return The boolean answer */ -inline bool is_batch_api_available() noexcept { return cufile_version() >= 1060; } +bool is_batch_api_available() noexcept; /** * @brief Check if cuFile's stream (async) API is available. @@ -257,6 +154,6 @@ inline bool is_batch_api_available() noexcept { return cufile_version() >= 1060; * * @return The boolean answer */ -inline bool is_stream_api_available() noexcept { return cufile_version() >= 1070; } +bool is_stream_api_available() noexcept; } // namespace kvikio diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp index 423eff9c60..a4efab02e5 100644 --- a/cpp/include/kvikio/shim/libcurl.hpp +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,22 +20,14 @@ "cannot include the remote IO API, please build KvikIO with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" #endif -#include #include #include #include -#include #include #include #include -#include -#include -#include -#include -#include - namespace kvikio { /** @@ -65,72 +57,26 @@ class LibCurl { // Curl handles free to be used. std::vector _free_curl_handles{}; - LibCurl() - { - CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT); - if (err != CURLE_OK) { - throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err)); - } - curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW); - if ((ver->features & CURL_VERSION_THREADSAFE) == 0) { - throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled"); - } - } - ~LibCurl() noexcept - { - _free_curl_handles.clear(); - curl_global_cleanup(); - } + LibCurl(); + ~LibCurl() noexcept; public: - static LibCurl& instance() - { - static LibCurl _instance; - return _instance; - } + static LibCurl& instance(); /** * @brief Returns a free curl handle if available. */ - UniqueHandlePtr get_free_handle() - { - UniqueHandlePtr ret; - std::lock_guard const lock(_mutex); - if (!_free_curl_handles.empty()) { - ret = std::move(_free_curl_handles.back()); - _free_curl_handles.pop_back(); - } - return ret; - } + UniqueHandlePtr get_free_handle(); /** * @brief Returns a curl handle, create a new handle if none is available. */ - UniqueHandlePtr get_handle() - { - // Check if we have a free handle available. - UniqueHandlePtr ret = get_free_handle(); - if (ret) { - curl_easy_reset(ret.get()); - } else { - // If not, we create a new handle. - CURL* raw_handle = curl_easy_init(); - if (raw_handle == nullptr) { - throw std::runtime_error("libcurl: call to curl_easy_init() failed"); - } - ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup); - } - return ret; - } + UniqueHandlePtr get_handle(); /** * @brief Retain a curl handle for later use. */ - void retain_handle(UniqueHandlePtr handle) - { - std::lock_guard const lock(_mutex); - _free_curl_handles.push_back(std::move(handle)); - } + void retain_handle(UniqueHandlePtr handle); }; /** @@ -156,23 +102,8 @@ class CurlHandle { * @param source_file Path of source file of the caller (for error messages). * @param source_line Line of source file of the caller (for error messages). */ - CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line) - : _handle{std::move(handle)}, - _source_file(std::move(source_file)), - _source_line(std::move(source_line)) - { - // Need CURLOPT_NOSIGNAL to support threading, see - // - setopt(CURLOPT_NOSIGNAL, 1L); - - // We always set CURLOPT_ERRORBUFFER to get better error messages. - _errbuf[0] = 0; // Set the error buffer as empty. - setopt(CURLOPT_ERRORBUFFER, _errbuf); - - // Make curl_easy_perform() fail when receiving HTTP code errors. - setopt(CURLOPT_FAILONERROR, 1L); - } - ~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } + CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line); + ~CurlHandle() noexcept; /** * @brief CurlHandle support is not movable or copyable. @@ -185,7 +116,7 @@ class CurlHandle { /** * @brief Get the underlying curl easy handle pointer. */ - CURL* handle() noexcept { return _handle.get(); } + CURL* handle() noexcept; /** * @brief Set option for the curl handle. @@ -212,22 +143,7 @@ class CurlHandle { * * See . */ - void perform() - { - // Perform the curl operation and check for errors. - CURLcode err = curl_easy_perform(handle()); - if (err != CURLE_OK) { - std::string msg(_errbuf); // We can do this because we always initialize `_errbuf` as empty. - std::stringstream ss; - ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line; - if (msg.empty()) { - ss << "(" << curl_easy_strerror(err) << ")"; - } else { - ss << "(" << msg << ")"; - } - throw std::runtime_error(ss.str()); - } - } + void perform(); /** * @brief Extract information from a curl handle. diff --git a/cpp/include/kvikio/shim/utils.hpp b/cpp/include/kvikio/shim/utils.hpp index 7a3c439899..bc47be205f 100644 --- a/cpp/include/kvikio/shim/utils.hpp +++ b/cpp/include/kvikio/shim/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,6 @@ #include #include -#include -#include #include namespace kvikio { @@ -46,13 +44,7 @@ namespace kvikio { * @param name Name of the library to load. * @return The library handle. */ -inline void* load_library(const char* name, int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE) -{ - ::dlerror(); // Clear old errors - void* ret = ::dlopen(name, mode); - if (ret == nullptr) { throw std::runtime_error(::dlerror()); } - return ret; -} +void* load_library(const char* name, int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE); /** * @brief Load shared library @@ -60,19 +52,8 @@ inline void* load_library(const char* name, int mode = RTLD_LAZY | RTLD_LOCAL | * @param names Vector of names to try when loading shared library. * @return The library handle. */ -inline void* load_library(const std::vector& names, - int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE) -{ - std::stringstream ss; - for (const char* name : names) { - ss << name << " "; - try { - return load_library(name, mode); - } catch (const std::runtime_error&) { - } - } - throw std::runtime_error("cannot open shared object file, tried: " + ss.str()); -} +void* load_library(const std::vector& names, + int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE); /** * @brief Get symbol using `dlsym` @@ -99,17 +80,7 @@ void get_symbol(T& handle, void* lib, const char* name) * * @return The boolean answer */ -[[nodiscard]] inline bool is_running_in_wsl() -{ - struct utsname buf {}; - int err = ::uname(&buf); - if (err == 0) { - const std::string name(static_cast(buf.release)); - // 'Microsoft' for WSL1 and 'microsoft' for WSL2 - return name.find("icrosoft") != std::string::npos; - } - return false; -} +[[nodiscard]] bool is_running_in_wsl(); /** * @brief Check if `/run/udev` is readable @@ -120,13 +91,6 @@ void get_symbol(T& handle, void* lib, const char* name) * * @return The boolean answer */ -[[nodiscard]] inline bool run_udev_readable() -{ - try { - return std::filesystem::is_directory("/run/udev"); - } catch (const std::filesystem::filesystem_error&) { - return false; - } -} +[[nodiscard]] bool run_udev_readable(); } // namespace kvikio diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp new file mode 100644 index 0000000000..7566c11532 --- /dev/null +++ b/cpp/src/cufile/config.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +namespace kvikio { +namespace detail { + +[[nodiscard]] inline const char* lookup_config_path() +{ + const char* env = std::getenv("CUFILE_ENV_PATH_JSON"); + if (env != nullptr && std::filesystem::exists(env)) { return env; } + if (std::filesystem::exists("/etc/cufile.json")) { return "/etc/cufile.json"; } + return ""; +} + +} // namespace detail + +const std::string& config_path() +{ + static const std::string ret = detail::lookup_config_path(); + return ret; +} + +} // namespace kvikio diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp new file mode 100644 index 0000000000..959a64b33b --- /dev/null +++ b/cpp/src/cufile/driver.cpp @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +namespace kvikio { +namespace detail { + +[[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept +{ + return (prop & (1U << flag)) != 0; +} + +inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept +{ + if (val) { + prop |= (1U << flag); + } else { + prop &= ~(1U << flag); + } +} +} // namespace detail + +#ifdef KVIKIO_CUFILE_FOUND + +DriverInitializer::DriverInitializer() { cuFileAPI::instance().driver_open(); } + +DriverInitializer::~DriverInitializer() +{ + try { + cuFileAPI::instance().driver_close(); + } catch (const CUfileException& e) { + std::cerr << "Unable to close GDS file driver: "; + std::cerr << e.what(); + std::cerr << std::endl; + } +} + +// Because Cython does not handle exceptions in the default +// constructor, we initialize `_props` lazily. +void DriverProperties::lazy_init() +{ + if (_initialized) { return; } + _initialized = true; + CUFILE_TRY(cuFileAPI::instance().DriverGetProperties(&_props)); +} + +bool DriverProperties::is_gds_available() +{ + // If both the major and minor version is zero, the GDS driver isn't loaded. + return !(get_nvfs_major_version() == 0 && get_nvfs_minor_version() == 0); +} + +[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version() +{ + lazy_init(); + return _props.nvfs.major_version; +} + +[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version() +{ + lazy_init(); + return _props.nvfs.minor_version; +} + +[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode() +{ + lazy_init(); + return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE); +} + +[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode() +{ + lazy_init(); + return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE); +} + +[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size() +{ + lazy_init(); + return _props.nvfs.poll_thresh_size; +} + +void DriverProperties::set_nvfs_poll_mode(bool enable) +{ + lazy_init(); + CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(enable, get_nvfs_poll_thresh_size())); + detail::set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable); +} + +void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb) +{ + lazy_init(); + CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(get_nvfs_poll_mode(), size_in_kb)); + _props.nvfs.poll_thresh_size = size_in_kb; +} + +[[nodiscard]] std::vector DriverProperties::get_nvfs_statusflags() +{ + lazy_init(); + std::vector ret; + if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) { + ret.push_back(CU_FILE_USE_POLL_MODE); + } + if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) { + ret.push_back(CU_FILE_ALLOW_COMPAT_MODE); + } + return ret; +} + +[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size() +{ + lazy_init(); + return _props.max_device_cache_size; +} + +void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb) +{ + lazy_init(); + CUFILE_TRY(cuFileAPI::instance().DriverSetMaxCacheSize(size_in_kb)); + _props.max_device_cache_size = size_in_kb; +} + +[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size() +{ + lazy_init(); + return _props.per_buffer_cache_size; +} + +[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size() +{ + lazy_init(); + return _props.max_device_pinned_mem_size; +} + +void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb) +{ + lazy_init(); + CUFILE_TRY(cuFileAPI::instance().DriverSetMaxPinnedMemSize(size_in_kb)); + _props.max_device_pinned_mem_size = size_in_kb; +} + +[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size() +{ +#ifdef KVIKIO_CUFILE_BATCH_API_FOUND + lazy_init(); + return _props.max_batch_io_size; +#else + return 0; +#endif +} + +#else +DriverInitializer::DriverInitializer() {} + +DriverProperties::DriverProperties() {} + +static bool DriverProperties::is_gds_available() { return false; } + +[[nodiscard]] static unsigned int DriverProperties::get_nvfs_major_version() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static unsigned int DriverProperties::get_nvfs_minor_version() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static bool DriverProperties::get_nvfs_allow_compat_mode() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static bool DriverProperties::get_nvfs_poll_mode() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static std::size_t DriverProperties::get_nvfs_poll_thresh_size() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +static void DriverProperties::set_nvfs_poll_mode(bool enable) +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +static void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb) +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static std::vector DriverProperties::get_nvfs_statusflags() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static std::size_t DriverProperties::get_max_device_cache_size() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +static void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb) +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static std::size_t DriverProperties::get_per_buffer_cache_size() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] static std::size_t DriverProperties::get_max_pinned_memory_size() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +static void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb) +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} + +[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size() +{ + throw CUfileException("KvikIO not compiled with cuFile.h"); +} +#endif + +} // namespace kvikio diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp new file mode 100644 index 0000000000..b6a1b3babf --- /dev/null +++ b/cpp/src/shim/cuda.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace kvikio { + +#ifdef KVIKIO_CUDA_FOUND +cudaAPI::cudaAPI() +{ + void* lib = load_library("libcuda.so.1"); + // Notice, the API version loaded must match the version used downstream. That is, + // if a project uses the `_v2` CUDA Driver API or the newest Runtime API, the symbols + // loaded should also be the `_v2` symbols. Thus, we use KVIKIO_STRINGIFY() to get + // the name of the symbol through cude.h. + get_symbol(MemHostAlloc, lib, KVIKIO_STRINGIFY(cuMemHostAlloc)); + get_symbol(MemFreeHost, lib, KVIKIO_STRINGIFY(cuMemFreeHost)); + get_symbol(MemcpyHtoDAsync, lib, KVIKIO_STRINGIFY(cuMemcpyHtoDAsync)); + get_symbol(MemcpyDtoHAsync, lib, KVIKIO_STRINGIFY(cuMemcpyDtoHAsync)); + get_symbol(PointerGetAttribute, lib, KVIKIO_STRINGIFY(cuPointerGetAttribute)); + get_symbol(PointerGetAttributes, lib, KVIKIO_STRINGIFY(cuPointerGetAttributes)); + get_symbol(CtxPushCurrent, lib, KVIKIO_STRINGIFY(cuCtxPushCurrent)); + get_symbol(CtxPopCurrent, lib, KVIKIO_STRINGIFY(cuCtxPopCurrent)); + get_symbol(CtxGetCurrent, lib, KVIKIO_STRINGIFY(cuCtxGetCurrent)); + get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange)); + get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName)); + get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString)); + get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet)); + get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain)); + get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease)); + get_symbol(StreamSynchronize, lib, KVIKIO_STRINGIFY(cuStreamSynchronize)); + get_symbol(StreamCreate, lib, KVIKIO_STRINGIFY(cuStreamCreate)); + get_symbol(StreamDestroy, lib, KVIKIO_STRINGIFY(cuStreamDestroy)); +} +#else +cudaAPI::cudaAPI() { throw std::runtime_error("KvikIO not compiled with CUDA support"); } +#endif + +cudaAPI& cudaAPI::instance() +{ + static cudaAPI _instance; + return _instance; +} + +#ifdef KVIKIO_CUDA_FOUND +bool is_cuda_available() +{ + try { + cudaAPI::instance(); + } catch (const std::runtime_error&) { + return false; + } + return true; +} +#endif + +} // namespace kvikio diff --git a/cpp/src/shim/cufile.cpp b/cpp/src/shim/cufile.cpp new file mode 100644 index 0000000000..a6758cab7b --- /dev/null +++ b/cpp/src/shim/cufile.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +namespace kvikio { + +#ifdef KVIKIO_CUFILE_FOUND +cuFileAPI::cuFileAPI() +{ + // CUDA versions before CUDA 11.7.1 did not ship libcufile.so.0, so this is + // a workaround that adds support for all prior versions of libcufile. + void* lib = load_library({"libcufile.so.0", + "libcufile.so.1.3.0" /* 11.7.0 */, + "libcufile.so.1.2.1" /* 11.6.2, 11.6.1 */, + "libcufile.so.1.2.0" /* 11.6.0 */, + "libcufile.so.1.1.1" /* 11.5.1 */, + "libcufile.so.1.1.0" /* 11.5.0 */, + "libcufile.so.1.0.2" /* 11.4.4, 11.4.3, 11.4.2 */, + "libcufile.so.1.0.1" /* 11.4.1 */, + "libcufile.so.1.0.0" /* 11.4.0 */}); + get_symbol(HandleRegister, lib, KVIKIO_STRINGIFY(cuFileHandleRegister)); + get_symbol(HandleDeregister, lib, KVIKIO_STRINGIFY(cuFileHandleDeregister)); + get_symbol(Read, lib, KVIKIO_STRINGIFY(cuFileRead)); + get_symbol(Write, lib, KVIKIO_STRINGIFY(cuFileWrite)); + get_symbol(BufRegister, lib, KVIKIO_STRINGIFY(cuFileBufRegister)); + get_symbol(BufDeregister, lib, KVIKIO_STRINGIFY(cuFileBufDeregister)); + get_symbol(DriverOpen, lib, KVIKIO_STRINGIFY(cuFileDriverOpen)); + get_symbol(DriverClose, lib, KVIKIO_STRINGIFY(cuFileDriverClose)); + get_symbol(DriverGetProperties, lib, KVIKIO_STRINGIFY(cuFileDriverGetProperties)); + get_symbol(DriverSetPollMode, lib, KVIKIO_STRINGIFY(cuFileDriverSetPollMode)); + get_symbol(DriverSetMaxCacheSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxCacheSize)); + get_symbol(DriverSetMaxPinnedMemSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxPinnedMemSize)); + +#ifdef KVIKIO_CUFILE_VERSION_API_FOUND + try { + get_symbol(GetVersion, lib, KVIKIO_STRINGIFY(cuFileGetVersion)); + int ver; + CUfileError_t const error = GetVersion(&ver); + if (error.err == CU_FILE_SUCCESS) { version = ver; } + } catch (std::runtime_error const&) { + } +#endif + + // Some symbols were introduced in later versions, so version guards are required. + // Note: `version` is 0 for cuFile versions prior to v1.8 because `cuFileGetVersion` + // did not exist. As a result, the batch and stream APIs are not loaded in versions + // 1.6 and 1.7, respectively, even though they are available. This trade-off is made + // for improved robustness. + if (version >= 1060) { + get_symbol(BatchIOSetUp, lib, KVIKIO_STRINGIFY(cuFileBatchIOSetUp)); + get_symbol(BatchIOSubmit, lib, KVIKIO_STRINGIFY(cuFileBatchIOSubmit)); + get_symbol(BatchIOGetStatus, lib, KVIKIO_STRINGIFY(cuFileBatchIOGetStatus)); + get_symbol(BatchIOCancel, lib, KVIKIO_STRINGIFY(cuFileBatchIOCancel)); + get_symbol(BatchIODestroy, lib, KVIKIO_STRINGIFY(cuFileBatchIODestroy)); + } + if (version >= 1070) { + get_symbol(ReadAsync, lib, KVIKIO_STRINGIFY(cuFileReadAsync)); + get_symbol(WriteAsync, lib, KVIKIO_STRINGIFY(cuFileWriteAsync)); + get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister)); + get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister)); + } + + // cuFile is supposed to open and close the driver automatically but + // because of a bug in cuFile v1.4 (CUDA v11.8) it sometimes segfaults: + // . + if (version < 1050) { driver_open(); } +} + +// Notice, we have to close the driver at program exit (if we opened it) even though we are +// not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the +// driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but +// this seems to be the lesser of two evils. +// [1] +cuFileAPI::~cuFileAPI() +{ + if (version < 1050) { driver_close(); } +} +#else +cuFileAPI::cuFileAPI() { throw std::runtime_error("KvikIO not compiled with cuFile.h"); } +#endif + +cuFileAPI& cuFileAPI::instance() +{ + static cuFileAPI _instance; + return _instance; +} + +void cuFileAPI::driver_open() +{ + CUfileError_t const error = DriverOpen(); + if (error.err != CU_FILE_SUCCESS) { + throw std::runtime_error(std::string{"Unable to open GDS file driver: "} + + cufileop_status_error(error.err)); + } +} + +void cuFileAPI::driver_close() +{ + CUfileError_t const error = DriverClose(); + if (error.err != CU_FILE_SUCCESS) { + throw std::runtime_error(std::string{"Unable to close GDS file driver: "} + + cufileop_status_error(error.err)); + } +} + +#ifdef KVIKIO_CUFILE_FOUND +bool is_cufile_library_available() +{ + try { + cuFileAPI::instance(); + } catch (const std::runtime_error&) { + return false; + } + return true; +} +#endif + +bool is_cufile_available() +{ + return is_cufile_library_available() && run_udev_readable() && !is_running_in_wsl(); +} + +#ifdef KVIKIO_CUFILE_FOUND +int cufile_version() +{ + try { + return cuFileAPI::instance().version; + } catch (std::runtime_error const&) { + return 0; + } +} +#endif + +bool is_batch_api_available() noexcept { return cufile_version() >= 1060; } + +bool is_stream_api_available() noexcept { return cufile_version() >= 1070; } + +} // namespace kvikio diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp new file mode 100644 index 0000000000..655a7f70fc --- /dev/null +++ b/cpp/src/shim/libcurl.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +LibCurl::LibCurl() +{ + CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT); + if (err != CURLE_OK) { + throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err)); + } + curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW); + if ((ver->features & CURL_VERSION_THREADSAFE) == 0) { + throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled"); + } +} + +LibCurl::~LibCurl() noexcept +{ + _free_curl_handles.clear(); + curl_global_cleanup(); +} + +LibCurl& LibCurl::instance() +{ + static LibCurl _instance; + return _instance; +} + +LibCurl::UniqueHandlePtr LibCurl::get_free_handle() +{ + UniqueHandlePtr ret; + std::lock_guard const lock(_mutex); + if (!_free_curl_handles.empty()) { + ret = std::move(_free_curl_handles.back()); + _free_curl_handles.pop_back(); + } + return ret; +} + +LibCurl::UniqueHandlePtr LibCurl::get_handle() +{ + // Check if we have a free handle available. + UniqueHandlePtr ret = get_free_handle(); + if (ret) { + curl_easy_reset(ret.get()); + } else { + // If not, we create a new handle. + CURL* raw_handle = curl_easy_init(); + if (raw_handle == nullptr) { + throw std::runtime_error("libcurl: call to curl_easy_init() failed"); + } + ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup); + } + return ret; +} + +void LibCurl::retain_handle(UniqueHandlePtr handle) +{ + std::lock_guard const lock(_mutex); + _free_curl_handles.push_back(std::move(handle)); +} + +CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle, + std::string source_file, + std::string source_line) + : _handle{std::move(handle)}, + _source_file(std::move(source_file)), + _source_line(std::move(source_line)) +{ + // Need CURLOPT_NOSIGNAL to support threading, see + // + setopt(CURLOPT_NOSIGNAL, 1L); + + // We always set CURLOPT_ERRORBUFFER to get better error messages. + _errbuf[0] = 0; // Set the error buffer as empty. + setopt(CURLOPT_ERRORBUFFER, _errbuf); + + // Make curl_easy_perform() fail when receiving HTTP code errors. + setopt(CURLOPT_FAILONERROR, 1L); +} + +CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } + +CURL* CurlHandle::handle() noexcept { return _handle.get(); } + +void CurlHandle::perform() +{ + // Perform the curl operation and check for errors. + CURLcode err = curl_easy_perform(handle()); + if (err != CURLE_OK) { + std::string msg(_errbuf); // We can do this because we always initialize `_errbuf` as empty. + std::stringstream ss; + ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line; + if (msg.empty()) { + ss << "(" << curl_easy_strerror(err) << ")"; + } else { + ss << "(" << msg << ")"; + } + throw std::runtime_error(ss.str()); + } +} + +} // namespace kvikio diff --git a/cpp/src/shim/utils.cpp b/cpp/src/shim/utils.cpp new file mode 100644 index 0000000000..ab9afbf648 --- /dev/null +++ b/cpp/src/shim/utils.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace kvikio { + +void* load_library(const char* name, int mode) +{ + ::dlerror(); // Clear old errors + void* ret = ::dlopen(name, mode); + if (ret == nullptr) { throw std::runtime_error(::dlerror()); } + return ret; +} + +void* load_library(const std::vector& names, int mode) +{ + std::stringstream ss; + for (const char* name : names) { + ss << name << " "; + try { + return load_library(name, mode); + } catch (const std::runtime_error&) { + } + } + throw std::runtime_error("cannot open shared object file, tried: " + ss.str()); +} + +[[nodiscard]] bool is_running_in_wsl() +{ + struct utsname buf {}; + int err = ::uname(&buf); + if (err == 0) { + const std::string name(static_cast(buf.release)); + // 'Microsoft' for WSL1 and 'microsoft' for WSL2 + return name.find("icrosoft") != std::string::npos; + } + return false; +} + +[[nodiscard]] bool run_udev_readable() +{ + try { + return std::filesystem::is_directory("/run/udev"); + } catch (const std::filesystem::filesystem_error&) { + return false; + } +} + +} // namespace kvikio From 786b8a2746bf310b9f5ff53401afb8fd296043e8 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 8 Jan 2025 10:50:30 -0500 Subject: [PATCH 2/8] Fixes --- cpp/include/kvikio/cufile/driver.hpp | 49 +++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp index 269761c75d..4a33289bda 100644 --- a/cpp/include/kvikio/cufile/driver.hpp +++ b/cpp/include/kvikio/cufile/driver.hpp @@ -22,6 +22,8 @@ namespace kvikio { +#ifdef KVIKIO_CUFILE_FOUND + class DriverInitializer { // Optional, if not used cuFiles opens the driver automatically public: @@ -45,12 +47,7 @@ class DriverProperties { void lazy_init(); public: -#ifdef KVIKIO_CUFILE_FOUND DriverProperties() = default; -#else - // Implement a non-default constructor to avoid `unused variable` warnings downstream - DriverProperties(); -#endif bool is_gds_available(); @@ -83,4 +80,46 @@ class DriverProperties { [[nodiscard]] std::size_t get_max_batch_io_size(); }; +#else +struct DriverInitializer { + // Implement a non-default constructor to avoid `unused variable` warnings downstream + DriverInitializer(); +}; + +struct DriverProperties { + // Implement a non-default constructor to avoid `unused variable` warnings downstream + DriverProperties(); + + static bool is_gds_available(); + + [[nodiscard]] static unsigned int get_nvfs_major_version(); + + [[nodiscard]] static unsigned int get_nvfs_minor_version(); + + [[nodiscard]] static bool get_nvfs_allow_compat_mode(); + + [[nodiscard]] static bool get_nvfs_poll_mode(); + + [[nodiscard]] static std::size_t get_nvfs_poll_thresh_size(); + + static void set_nvfs_poll_mode(bool enable); + + static void set_nvfs_poll_thresh_size(std::size_t size_in_kb); + + [[nodiscard]] static std::vector get_nvfs_statusflags(); + + [[nodiscard]] static std::size_t get_max_device_cache_size(); + + static void set_max_device_cache_size(std::size_t size_in_kb); + + [[nodiscard]] static std::size_t get_per_buffer_cache_size(); + + [[nodiscard]] static std::size_t get_max_pinned_memory_size(); + + static void set_max_pinned_memory_size(std::size_t size_in_kb); + + [[nodiscard]] std::size_t get_max_batch_io_size(); +}; +#endif + } // namespace kvikio From 074ccff5531938ea3a78d64215a083d38c031f6b Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 8 Jan 2025 11:22:21 -0500 Subject: [PATCH 3/8] Fixes. Continue the work --- cpp/include/kvikio/utils.hpp | 152 +++------------------------- cpp/src/cufile/driver.cpp | 28 +++--- cpp/src/utils.cpp | 186 +++++++++++++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 152 deletions(-) create mode 100644 cpp/src/utils.cpp diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 3cad457ffa..5593c6c9de 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,8 +18,6 @@ #include #include #include -#include -#include #include #include #include @@ -29,7 +27,6 @@ #include #endif -#include #include namespace kvikio { @@ -37,27 +34,11 @@ namespace kvikio { // cuFile defines a page size to 4 KiB inline constexpr std::size_t page_size = 4096; -[[nodiscard]] inline off_t convert_size2off(std::size_t x) -{ - if (x >= static_cast(std::numeric_limits::max())) { - throw CUfileException("size_t argument too large to fit off_t"); - } - return static_cast(x); -} +[[nodiscard]] off_t convert_size2off(std::size_t x); -[[nodiscard]] inline ssize_t convert_size2ssize(std::size_t x) -{ - if (x >= static_cast(std::numeric_limits::max())) { - throw CUfileException("size_t argument too large to fit ssize_t"); - } - return static_cast(x); -} +[[nodiscard]] ssize_t convert_size2ssize(std::size_t x); -[[nodiscard]] inline CUdeviceptr convert_void2deviceptr(const void* devPtr) -{ - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) - return reinterpret_cast(devPtr); -} +[[nodiscard]] CUdeviceptr convert_void2deviceptr(const void* devPtr); /** * @brief Help function to convert value to 64 bit signed integer @@ -91,25 +72,7 @@ template >* = nullptr> * @return The boolean answer */ #ifdef KVIKIO_CUDA_FOUND -inline bool is_host_memory(const void* ptr) -{ - CUpointer_attribute attrs[1] = { - CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - }; - CUmemorytype memtype{}; - void* data[1] = {&memtype}; - CUresult result = - cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr)); - - // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED - if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; } - CUDA_DRIVER_TRY(result); - - // Notice, queying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory - // is unregistered host memory. This is undocumented but how the Runtime CUDA API - // does it to support `cudaMemoryTypeUnregistered`. - return memtype == 0 || memtype == CU_MEMORYTYPE_HOST; -} +bool is_host_memory(const void* ptr); #else constexpr bool is_host_memory(const void* ptr) { return true; } #endif @@ -120,13 +83,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; } * @param ptr Device pointer to query * @return The device ordinal */ -[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr) -{ - int ret = 0; - CUDA_DRIVER_TRY( - cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr)); - return ret; -} +[[nodiscard]] int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr); /** * @brief Given a device ordinal, return the primary context of the device. @@ -136,25 +93,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; } * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices * @return Primary CUDA context */ -[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal) -{ - static std::map _cache; - static std::mutex _mutex; - std::lock_guard const lock(_mutex); - - if (_cache.find(ordinal) == _cache.end()) { - CUdevice dev{}; - CUcontext ctx{}; - CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal)); - - // Notice, we let the primary context leak at program exit. We do this because `_cache` - // is static and we are not allowed to call `cuDevicePrimaryCtxRelease()` after main: - // - CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev)); - _cache.emplace(ordinal, ctx); - } - return _cache.at(ordinal); -} +[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal); /** * @brief Return the CUDA context associated the given device pointer, if any. @@ -162,15 +101,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; } * @param dev_ptr Device pointer to query * @return Usable CUDA context, if one were found. */ -[[nodiscard]] inline std::optional get_context_associated_pointer(CUdeviceptr dev_ptr) -{ - CUcontext ctx = nullptr; - const CUresult err = - cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); - if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; } - if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); } - return {}; -} +[[nodiscard]] std::optional get_context_associated_pointer(CUdeviceptr dev_ptr); /** * @brief Check if the current CUDA context can access the given device pointer @@ -178,15 +109,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; } * @param dev_ptr Device pointer to query * @return The boolean answer */ -[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr) -{ - CUdeviceptr current_ctx_dev_ptr{}; - const CUresult err = cudaAPI::instance().PointerGetAttribute( - ¤t_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr); - if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; } - if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); } - return false; -} +[[nodiscard]] bool current_context_can_access_pointer(CUdeviceptr dev_ptr); /** * @brief Return a CUDA context that can be used with the given device pointer @@ -204,28 +127,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; } * @param devPtr Device pointer to query * @return Usable CUDA context */ -[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr) -{ - CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); - - // First we check if a context has been associated with `devPtr`. - { - auto ctx = get_context_associated_pointer(dev_ptr); - if (ctx.has_value()) { return ctx.value(); } - } - - // If this isn't the case, we check the current context. If it exist and can access `devPtr`, we - // return the current context. - { - CUcontext ctx = nullptr; - CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); - if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; } - } - - // Finally, if we didn't find any usable context, we return the primary context of the - // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure. - return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr)); -} +[[nodiscard]] CUcontext get_context_from_pointer(const void* devPtr); /** * @brief Push CUDA context on creation and pop it on destruction @@ -235,43 +137,17 @@ class PushAndPopContext { CUcontext _ctx; public: - PushAndPopContext(CUcontext ctx) : _ctx{ctx} - { - CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx)); - } + PushAndPopContext(CUcontext ctx); PushAndPopContext(const PushAndPopContext&) = delete; PushAndPopContext& operator=(PushAndPopContext const&) = delete; PushAndPopContext(PushAndPopContext&&) = delete; PushAndPopContext&& operator=(PushAndPopContext&&) = delete; - ~PushAndPopContext() - { - try { - CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException); - } catch (const CUfileException& e) { - std::cerr << e.what() << std::endl; - } - } + ~PushAndPopContext(); }; // Find the base and offset of the memory allocation `devPtr` is in -inline std::tuple get_alloc_info(const void* devPtr, - CUcontext* ctx = nullptr) -{ - auto dev = convert_void2deviceptr(devPtr); - CUdeviceptr base_ptr{}; - std::size_t base_size{}; - CUcontext _ctx{}; - if (ctx != nullptr) { - _ctx = *ctx; - } else { - _ctx = get_context_from_pointer(devPtr); - } - PushAndPopContext context(_ctx); - CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev)); - std::size_t offset = dev - base_ptr; - // NOLINTNEXTLINE(performance-no-int-to-ptr, cppcoreguidelines-pro-type-reinterpret-cast) - return std::make_tuple(reinterpret_cast(base_ptr), base_size, offset); -} +std::tuple get_alloc_info(const void* devPtr, + CUcontext* ctx = nullptr); template inline bool is_future_done(const T& future) diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp index 959a64b33b..127050ed06 100644 --- a/cpp/src/cufile/driver.cpp +++ b/cpp/src/cufile/driver.cpp @@ -174,69 +174,69 @@ DriverInitializer::DriverInitializer() {} DriverProperties::DriverProperties() {} -static bool DriverProperties::is_gds_available() { return false; } +bool DriverProperties::is_gds_available() { return false; } -[[nodiscard]] static unsigned int DriverProperties::get_nvfs_major_version() +[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static unsigned int DriverProperties::get_nvfs_minor_version() +[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static bool DriverProperties::get_nvfs_allow_compat_mode() +[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static bool DriverProperties::get_nvfs_poll_mode() +[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static std::size_t DriverProperties::get_nvfs_poll_thresh_size() +[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -static void DriverProperties::set_nvfs_poll_mode(bool enable) +void DriverProperties::set_nvfs_poll_mode(bool enable) { throw CUfileException("KvikIO not compiled with cuFile.h"); } -static void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb) +void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb) { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static std::vector DriverProperties::get_nvfs_statusflags() +[[nodiscard]] std::vector DriverProperties::get_nvfs_statusflags() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static std::size_t DriverProperties::get_max_device_cache_size() +[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -static void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb) +void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb) { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static std::size_t DriverProperties::get_per_buffer_cache_size() +[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] static std::size_t DriverProperties::get_max_pinned_memory_size() +[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -static void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb) +void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb) { throw CUfileException("KvikIO not compiled with cuFile.h"); } diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp new file mode 100644 index 0000000000..32834cf3a4 --- /dev/null +++ b/cpp/src/utils.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef KVIKIO_CUDA_FOUND +#include +#endif + +#include +#include +#include + +namespace kvikio { + +off_t convert_size2off(std::size_t x) +{ + if (x >= static_cast(std::numeric_limits::max())) { + throw CUfileException("size_t argument too large to fit off_t"); + } + return static_cast(x); +} + +ssize_t convert_size2ssize(std::size_t x) +{ + if (x >= static_cast(std::numeric_limits::max())) { + throw CUfileException("size_t argument too large to fit ssize_t"); + } + return static_cast(x); +} + +CUdeviceptr convert_void2deviceptr(const void* devPtr) +{ + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + return reinterpret_cast(devPtr); +} + +#ifdef KVIKIO_CUDA_FOUND +bool is_host_memory(const void* ptr) +{ + CUpointer_attribute attrs[1] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + }; + CUmemorytype memtype{}; + void* data[1] = {&memtype}; + CUresult result = + cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr)); + + // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED + if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; } + CUDA_DRIVER_TRY(result); + + // Notice, queying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory + // is unregistered host memory. This is undocumented but how the Runtime CUDA API + // does it to support `cudaMemoryTypeUnregistered`. + return memtype == 0 || memtype == CU_MEMORYTYPE_HOST; +} +#endif + +int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr) +{ + int ret = 0; + CUDA_DRIVER_TRY( + cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr)); + return ret; +} + +CUcontext get_primary_cuda_context(int ordinal) +{ + static std::map _cache; + static std::mutex _mutex; + std::lock_guard const lock(_mutex); + + if (_cache.find(ordinal) == _cache.end()) { + CUdevice dev{}; + CUcontext ctx{}; + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal)); + + // Notice, we let the primary context leak at program exit. We do this because `_cache` + // is static and we are not allowed to call `cuDevicePrimaryCtxRelease()` after main: + // + CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev)); + _cache.emplace(ordinal, ctx); + } + return _cache.at(ordinal); +} + +std::optional get_context_associated_pointer(CUdeviceptr dev_ptr) +{ + CUcontext ctx = nullptr; + const CUresult err = + cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); + if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; } + if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); } + return {}; +} + +bool current_context_can_access_pointer(CUdeviceptr dev_ptr) +{ + CUdeviceptr current_ctx_dev_ptr{}; + const CUresult err = cudaAPI::instance().PointerGetAttribute( + ¤t_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr); + if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; } + if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); } + return false; +} + +CUcontext get_context_from_pointer(const void* devPtr) +{ + CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); + + // First we check if a context has been associated with `devPtr`. + { + auto ctx = get_context_associated_pointer(dev_ptr); + if (ctx.has_value()) { return ctx.value(); } + } + + // If this isn't the case, we check the current context. If it exist and can access `devPtr`, we + // return the current context. + { + CUcontext ctx = nullptr; + CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); + if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; } + } + + // Finally, if we didn't find any usable context, we return the primary context of the + // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure. + return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr)); +} + +PushAndPopContext::PushAndPopContext(CUcontext ctx) : _ctx{ctx} +{ + CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx)); +} + +PushAndPopContext::~PushAndPopContext() +{ + try { + CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException); + } catch (const CUfileException& e) { + std::cerr << e.what() << std::endl; + } +} + +// Find the base and offset of the memory allocation `devPtr` is in +std::tuple get_alloc_info(const void* devPtr, + CUcontext* ctx = nullptr) +{ + auto dev = convert_void2deviceptr(devPtr); + CUdeviceptr base_ptr{}; + std::size_t base_size{}; + CUcontext _ctx{}; + if (ctx != nullptr) { + _ctx = *ctx; + } else { + _ctx = get_context_from_pointer(devPtr); + } + PushAndPopContext context(_ctx); + CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev)); + std::size_t offset = dev - base_ptr; + // NOLINTNEXTLINE(performance-no-int-to-ptr, cppcoreguidelines-pro-type-reinterpret-cast) + return std::make_tuple(reinterpret_cast(base_ptr), base_size, offset); +} + +} // namespace kvikio From 76830b9c3c84d74333e516c40313a16a8418318c Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 8 Jan 2025 16:08:32 -0500 Subject: [PATCH 4/8] Complete the initial separation --- cpp/CMakeLists.txt | 18 ++- cpp/include/kvikio/batch.hpp | 84 +++-------- cpp/include/kvikio/bounce_buffer.hpp | 84 ++--------- cpp/include/kvikio/buffer.hpp | 52 ++----- cpp/include/kvikio/defaults.hpp | 175 +++------------------ cpp/include/kvikio/posix_io.hpp | 58 ++----- cpp/include/kvikio/remote_handle.hpp | 200 ++---------------------- cpp/include/kvikio/stream.hpp | 79 ++-------- cpp/src/batch.cpp | 118 +++++++++++++++ cpp/src/bounce_buffer.cpp | 107 +++++++++++++ cpp/src/buffer.cpp | 64 ++++++++ cpp/src/cufile/config.cpp | 6 +- cpp/src/cufile/driver.cpp | 54 +++---- cpp/src/defaults.cpp | 203 +++++++++++++++++++++++++ cpp/src/error.cpp | 17 +++ cpp/src/file_handle.cpp | 6 +- cpp/src/posix_io.cpp | 79 ++++++++++ cpp/src/remote_handle.cpp | 218 ++++++++++++++++++++++++++- cpp/src/shim/utils.cpp | 4 +- cpp/src/stream.cpp | 102 +++++++++++++ cpp/src/utils.cpp | 5 +- 21 files changed, 1054 insertions(+), 679 deletions(-) create mode 100644 cpp/src/batch.cpp create mode 100644 cpp/src/bounce_buffer.cpp create mode 100644 cpp/src/buffer.cpp create mode 100644 cpp/src/defaults.cpp create mode 100644 cpp/src/error.cpp create mode 100644 cpp/src/posix_io.cpp create mode 100644 cpp/src/stream.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ea7d29a06f..5f288d370f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -131,8 +131,22 @@ include(cmake/thirdparty/get_thread_pool.cmake) # ################################################################################################## # * library targets -------------------------------------------------------------------------------- -set(SOURCES "src/file_handle.cpp" "src/cufile/config.cpp" "src/cufile/driver.cpp" - "src/shim/cuda.cpp" "src/shim/cufile.cpp" "src/shim/libcurl.cpp" "src/shim/utils.cpp" +set(SOURCES + "src/batch.cpp" + "src/bounce_buffer.cpp" + "src/buffer.cpp" + "src/cufile/config.cpp" + "src/cufile/driver.cpp" + "src/defaults.cpp" + "src/error.cpp" + "src/file_handle.cpp" + "src/posix_io.cpp" + "src/shim/cuda.cpp" + "src/shim/cufile.cpp" + "src/shim/libcurl.cpp" + "src/shim/utils.cpp" + "src/stream.cpp" + "src/utils.cpp" ) if(KvikIO_REMOTE_SUPPORT) diff --git a/cpp/include/kvikio/batch.hpp b/cpp/include/kvikio/batch.hpp index 7eebbd4df0..9927962a65 100644 --- a/cpp/include/kvikio/batch.hpp +++ b/cpp/include/kvikio/batch.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,36 +73,22 @@ class BatchHandle { * * @param max_num_events The maximum number of operations supported by this instance. */ - BatchHandle(int max_num_events) : _initialized{true}, _max_num_events{max_num_events} - { - CUFILE_TRY(cuFileAPI::instance().BatchIOSetUp(&_handle, max_num_events)); - } + BatchHandle(int max_num_events); /** * @brief BatchHandle support move semantic but isn't copyable */ BatchHandle(const BatchHandle&) = delete; BatchHandle& operator=(BatchHandle const&) = delete; - BatchHandle(BatchHandle&& o) noexcept - : _initialized{std::exchange(o._initialized, false)}, - _max_num_events{std::exchange(o._max_num_events, 0)} - { - _handle = std::exchange(o._handle, CUfileBatchHandle_t{}); - } - ~BatchHandle() noexcept { close(); } + BatchHandle(BatchHandle&& o) noexcept; + ~BatchHandle() noexcept; - [[nodiscard]] bool closed() const noexcept { return !_initialized; } + [[nodiscard]] bool closed() const noexcept; /** * @brief Destroy the batch handle and free up resources */ - void close() noexcept - { - if (closed()) { return; } - _initialized = false; - - cuFileAPI::instance().BatchIODestroy(_handle); - } + void close() noexcept; /** * @brief Submit a vector of batch operations @@ -110,31 +96,7 @@ class BatchHandle { * @param operations The vector of batch operations, which must not exceed the * `max_num_events`. */ - void submit(const std::vector& operations) - { - if (convert_size2ssize(operations.size()) > _max_num_events) { - throw CUfileException("Cannot submit more than the max_num_events)"); - } - std::vector io_batch_params; - io_batch_params.reserve(operations.size()); - for (const auto& op : operations) { - if (op.file_handle.is_compat_mode_preferred()) { - throw CUfileException("Cannot submit a FileHandle opened in compatibility mode"); - } - - io_batch_params.push_back(CUfileIOParams_t{.mode = CUFILE_BATCH, - .u = {.batch = {.devPtr_base = op.devPtr_base, - .file_offset = op.file_offset, - .devPtr_offset = op.devPtr_offset, - .size = op.size}}, - .fh = op.file_handle.handle(), - .opcode = op.opcode, - .cookie = nullptr}); - } - - CUFILE_TRY(cuFileAPI::instance().BatchIOSubmit( - _handle, io_batch_params.size(), io_batch_params.data(), 0)); - } + void submit(const std::vector& operations); /** * @brief Get status of submitted operations @@ -148,16 +110,9 @@ class BatchHandle { */ std::vector status(unsigned min_nr, unsigned max_nr, - struct timespec* timeout = nullptr) - { - std::vector ret; - ret.resize(_max_num_events); - CUFILE_TRY(cuFileAPI::instance().BatchIOGetStatus(_handle, min_nr, &max_nr, &ret[0], timeout)); - ret.resize(max_nr); - return ret; - } - - void cancel() { CUFILE_TRY(cuFileAPI::instance().BatchIOCancel(_handle)); } + struct timespec* timeout = nullptr); + + void cancel(); }; #else @@ -166,24 +121,19 @@ class BatchHandle { public: BatchHandle() noexcept = default; - BatchHandle(int max_num_events) - { - throw CUfileException("BatchHandle requires cuFile's batch API, please build with CUDA v12.1+"); - } + BatchHandle(int max_num_events); - [[nodiscard]] bool closed() const noexcept { return true; } + [[nodiscard]] bool closed() const noexcept; - void close() noexcept {} + void close() noexcept; - void submit(const std::vector& operations) {} + void submit(const std::vector& operations); std::vector status(unsigned min_nr, unsigned max_nr, - struct timespec* timeout = nullptr) - { - return std::vector{}; - } - void cancel() {} + struct timespec* timeout = nullptr); + + void cancel(); }; #endif diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp index 498f1d6f5f..5a7623a6a4 100644 --- a/cpp/include/kvikio/bounce_buffer.hpp +++ b/cpp/include/kvikio/bounce_buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include @@ -47,18 +46,15 @@ class AllocRetain { std::size_t const _size; public: - Alloc(AllocRetain* manager, void* alloc, std::size_t size) - : _manager(manager), _alloc{alloc}, _size{size} - { - } + Alloc(AllocRetain* manager, void* alloc, std::size_t size); Alloc(Alloc const&) = delete; Alloc& operator=(Alloc const&) = delete; Alloc(Alloc&& o) = delete; Alloc& operator=(Alloc&& o) = delete; - ~Alloc() noexcept { _manager->put(_alloc, _size); } - void* get() noexcept { return _alloc; } - void* get(std::ptrdiff_t offset) noexcept { return static_cast(_alloc) + offset; } - std::size_t size() noexcept { return _size; } + ~Alloc() noexcept; + void* get() noexcept; + void* get(std::ptrdiff_t offset) noexcept; + std::size_t size() noexcept; }; AllocRetain() = default; @@ -77,80 +73,28 @@ class AllocRetain { * * @return The number of bytes cleared */ - std::size_t _clear() - { - std::size_t ret = _free_allocs.size() * _size; - while (!_free_allocs.empty()) { - CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(_free_allocs.top())); - _free_allocs.pop(); - } - return ret; - } + std::size_t _clear(); /** * @brief Ensure the sizes of the retained allocations match `defaults::bounce_buffer_size()` * * NB: `_mutex` must be taken prior to calling this function. */ - void _ensure_alloc_size() - { - auto const bounce_buffer_size = defaults::bounce_buffer_size(); - if (_size != bounce_buffer_size) { - _clear(); - _size = bounce_buffer_size; - } - } + void _ensure_alloc_size(); public: - [[nodiscard]] Alloc get() - { - std::lock_guard const lock(_mutex); - _ensure_alloc_size(); - - // Check if we have an allocation available - if (!_free_allocs.empty()) { - void* ret = _free_allocs.top(); - _free_allocs.pop(); - return Alloc(this, ret, _size); - } - - // If no available allocation, allocate and register a new one - void* alloc{}; - // Allocate page-locked host memory - CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&alloc, _size, CU_MEMHOSTREGISTER_PORTABLE)); - return Alloc(this, alloc, _size); - } - - void put(void* alloc, std::size_t size) - { - std::lock_guard const lock(_mutex); - _ensure_alloc_size(); - - // If the size of `alloc` matches the sizes of the retained allocations, - // it is added to the set of free allocation otherwise it is freed. - if (size == _size) { - _free_allocs.push(alloc); - } else { - CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(alloc)); - } - } + [[nodiscard]] Alloc get(); + + void put(void* alloc, std::size_t size); /** * @brief Free all retained allocations * * @return The number of bytes cleared */ - std::size_t clear() - { - std::lock_guard const lock(_mutex); - return _clear(); - } - - KVIKIO_EXPORT static AllocRetain& instance() - { - static AllocRetain _instance; - return _instance; - } + std::size_t clear(); + + KVIKIO_EXPORT static AllocRetain& instance(); AllocRetain(AllocRetain const&) = delete; AllocRetain& operator=(AllocRetain const&) = delete; diff --git a/cpp/include/kvikio/buffer.hpp b/cpp/include/kvikio/buffer.hpp index 85c60b3f90..9cef45a6f0 100644 --- a/cpp/include/kvikio/buffer.hpp +++ b/cpp/include/kvikio/buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,17 +15,8 @@ */ #pragma once -#include -#include -#include #include -#include -#include -#include -#include -#include - namespace kvikio { /** @@ -44,32 +35,17 @@ namespace kvikio { * streaming buffer that is reused across multiple cuFile IO operations. */ /*NOLINTNEXTLINE(readability-function-cognitive-complexity)*/ -inline void buffer_register(const void* devPtr_base, - std::size_t size, - int flags = 0, - const std::vector& errors_to_ignore = std::vector()) -{ - if (defaults::is_compat_mode_preferred()) { return; } - CUfileError_t status = cuFileAPI::instance().BufRegister(devPtr_base, size, flags); - if (status.err != CU_FILE_SUCCESS) { - // Check if `status.err` is in `errors_to_ignore` - if (std::find(errors_to_ignore.begin(), errors_to_ignore.end(), status.err) == - errors_to_ignore.end()) { - CUFILE_TRY(status); - } - } -} +void buffer_register(const void* devPtr_base, + std::size_t size, + int flags = 0, + const std::vector& errors_to_ignore = std::vector()); /** * @brief deregister an already registered device memory from cuFile * * @param devPtr_base device pointer to deregister */ -inline void buffer_deregister(const void* devPtr_base) -{ - if (defaults::is_compat_mode_preferred()) { return; } - CUFILE_TRY(cuFileAPI::instance().BufDeregister(devPtr_base)); -} +void buffer_deregister(const void* devPtr_base); /** * @brief Register device memory allocation which is part of devPtr. Use this @@ -85,23 +61,15 @@ inline void buffer_deregister(const void* devPtr_base) * @warning This API is intended for usecases where the memory is used as * streaming buffer that is reused across multiple cuFile IO operations. */ -inline void memory_register(const void* devPtr, - int flags = 0, - const std::vector& errors_to_ignore = {}) -{ - auto [base, nbytes, offset] = get_alloc_info(devPtr); - buffer_register(base, nbytes, flags, errors_to_ignore); -} +void memory_register(const void* devPtr, + int flags = 0, + const std::vector& errors_to_ignore = {}); /** * @brief deregister an already registered device memory from cuFile. * * @param devPtr device pointer to deregister */ -inline void memory_deregister(const void* devPtr) -{ - auto [base, nbytes, offset] = get_alloc_info(devPtr); - buffer_deregister(base); -} +void memory_deregister(const void* devPtr); } // namespace kvikio diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index 91071cbb28..4c87724445 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,13 @@ * limitations under the License. */ +// Enable documentation of the enum. /** * @file */ #pragma once -#include #include #include #include @@ -55,25 +55,7 @@ namespace detail { * - `AUTO` * @return A CompatMode enum. */ -inline CompatMode parse_compat_mode_str(std::string_view compat_mode_str) -{ - // Convert to lowercase - std::string tmp{compat_mode_str}; - std::transform( - tmp.begin(), tmp.end(), tmp.begin(), [](unsigned char c) { return std::tolower(c); }); - - CompatMode res{}; - if (tmp == "on" || tmp == "true" || tmp == "yes" || tmp == "1") { - res = CompatMode::ON; - } else if (tmp == "off" || tmp == "false" || tmp == "no" || tmp == "0") { - res = CompatMode::OFF; - } else if (tmp == "auto") { - res = CompatMode::AUTO; - } else { - throw std::invalid_argument("Unknown compatibility mode: " + std::string{tmp}); - } - return res; -} +CompatMode parse_compat_mode_str(std::string_view compat_mode_str); template T getenv_or(std::string_view env_var_name, T default_val) @@ -92,44 +74,10 @@ T getenv_or(std::string_view env_var_name, T default_val) } template <> -inline bool getenv_or(std::string_view env_var_name, bool default_val) -{ - const auto* env_val = std::getenv(env_var_name.data()); - if (env_val == nullptr) { return default_val; } - try { - // Try parsing `env_var_name` as a integer - return static_cast(std::stoi(env_val)); - } catch (const std::invalid_argument&) { - } - // Convert to lowercase - std::string str{env_val}; - // Special considerations regarding the case conversion: - // - std::tolower() is not an addressable function. Passing it to std::transform() as - // a function pointer, if the compile turns out successful, causes the program behavior - // "unspecified (possibly ill-formed)", hence the lambda. ::tolower() is addressable - // and does not have this problem, but the following item still applies. - // - To avoid UB in std::tolower() or ::tolower(), the character must be cast to unsigned char. - std::transform( - str.begin(), str.end(), str.begin(), [](unsigned char c) { return std::tolower(c); }); - // Trim whitespaces - std::stringstream trimmer; - trimmer << str; - str.clear(); - trimmer >> str; - // Match value - if (str == "true" || str == "on" || str == "yes") { return true; } - if (str == "false" || str == "off" || str == "no") { return false; } - throw std::invalid_argument("unknown config value " + std::string{env_var_name} + "=" + - std::string{env_val}); -} +bool getenv_or(std::string_view env_var_name, bool default_val); template <> -inline CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val) -{ - auto* env_val = std::getenv(env_var_name.data()); - if (env_val == nullptr) { return default_val; } - return parse_compat_mode_str(env_val); -} +CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val); } // namespace detail @@ -145,54 +93,11 @@ class defaults { std::size_t _gds_threshold; std::size_t _bounce_buffer_size; - static unsigned int get_num_threads_from_env() - { - const int ret = detail::getenv_or("KVIKIO_NTHREADS", 1); - if (ret <= 0) { - throw std::invalid_argument("KVIKIO_NTHREADS has to be a positive integer greater than zero"); - } - return ret; - } + static unsigned int get_num_threads_from_env(); - defaults() - { - // Determine the default value of `compat_mode` - { - _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO); - } - // Determine the default value of `task_size` - { - const ssize_t env = detail::getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024); - if (env <= 0) { - throw std::invalid_argument( - "KVIKIO_TASK_SIZE has to be a positive integer greater than zero"); - } - _task_size = env; - } - // Determine the default value of `gds_threshold` - { - const ssize_t env = detail::getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024); - if (env < 0) { - throw std::invalid_argument("KVIKIO_GDS_THRESHOLD has to be a positive integer"); - } - _gds_threshold = env; - } - // Determine the default value of `bounce_buffer_size` - { - const ssize_t env = detail::getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024); - if (env <= 0) { - throw std::invalid_argument( - "KVIKIO_BOUNCE_BUFFER_SIZE has to be a positive integer greater than zero"); - } - _bounce_buffer_size = env; - } - } + defaults(); - KVIKIO_EXPORT static defaults* instance() - { - static defaults _instance; - return &_instance; - } + KVIKIO_EXPORT static defaults* instance(); public: /** @@ -213,7 +118,7 @@ class defaults { * * @return Compatibility mode. */ - [[nodiscard]] static CompatMode compat_mode() { return instance()->_compat_mode; } + [[nodiscard]] static CompatMode compat_mode(); /** * @brief Reset the value of `kvikio::defaults::compat_mode()`. @@ -223,7 +128,7 @@ class defaults { * * @param compat_mode Compatibility mode. */ - static void compat_mode_reset(CompatMode compat_mode) { instance()->_compat_mode = compat_mode; } + static void compat_mode_reset(CompatMode compat_mode); /** * @brief Infer the `AUTO` compatibility mode from the system runtime. @@ -234,16 +139,7 @@ class defaults { * (`ON`/`OFF`/`AUTO`) to two (`ON`/`OFF`) so as to determine the actual I/O path. This function * is lightweight as the inferred result is cached. */ - static CompatMode infer_compat_mode_if_auto(CompatMode compat_mode) - { - if (compat_mode == CompatMode::AUTO) { - static auto inferred_compat_mode_for_auto = []() -> CompatMode { - return is_cufile_available() ? CompatMode::OFF : CompatMode::ON; - }(); - return inferred_compat_mode_for_auto; - } - return compat_mode; - } + static CompatMode infer_compat_mode_if_auto(CompatMode compat_mode); /** * @brief Given a requested compatibility mode, whether it is expected to reduce to `ON`. @@ -260,12 +156,7 @@ class defaults { * @param compat_mode Compatibility mode. * @return Boolean answer. */ - static bool is_compat_mode_preferred(CompatMode compat_mode) - { - return compat_mode == CompatMode::ON || - (compat_mode == CompatMode::AUTO && - defaults::infer_compat_mode_if_auto(compat_mode) == CompatMode::ON); - } + static bool is_compat_mode_preferred(CompatMode compat_mode); /** * @brief Whether the global compatibility mode from class defaults is expected to be `ON`. @@ -281,7 +172,7 @@ class defaults { * * @return Boolean answer. */ - static bool is_compat_mode_preferred() { return is_compat_mode_preferred(compat_mode()); } + static bool is_compat_mode_preferred(); /** * @brief Get the default thread pool. @@ -292,7 +183,7 @@ class defaults { * * @return The the default thread pool instance. */ - [[nodiscard]] static BS::thread_pool& thread_pool() { return instance()->_thread_pool; } + [[nodiscard]] static BS::thread_pool& thread_pool(); /** * @brief Get the number of threads in the default thread pool. @@ -302,10 +193,7 @@ class defaults { * * @return The number of threads. */ - [[nodiscard]] static unsigned int thread_pool_nthreads() - { - return thread_pool().get_thread_count(); - } + [[nodiscard]] static unsigned int thread_pool_nthreads(); /** * @brief Reset the number of threads in the default thread pool. Waits for all currently running @@ -316,13 +204,7 @@ class defaults { * * @param nthreads The number of threads to use. */ - static void thread_pool_nthreads_reset(unsigned int nthreads) - { - if (nthreads == 0) { - throw std::invalid_argument("number of threads must be a positive integer greater than zero"); - } - thread_pool().reset(nthreads); - } + static void thread_pool_nthreads_reset(unsigned int nthreads); /** * @brief Get the default task size used for parallel IO operations. @@ -332,20 +214,14 @@ class defaults { * * @return The default task size in bytes. */ - [[nodiscard]] static std::size_t task_size() { return instance()->_task_size; } + [[nodiscard]] static std::size_t task_size(); /** * @brief Reset the default task size used for parallel IO operations. * * @param nbytes The default task size in bytes. */ - static void task_size_reset(std::size_t nbytes) - { - if (nbytes == 0) { - throw std::invalid_argument("task size must be a positive integer greater than zero"); - } - instance()->_task_size = nbytes; - } + static void task_size_reset(std::size_t nbytes); /** * @brief Get the default GDS threshold, which is the minimum size to use GDS (in bytes). @@ -358,13 +234,13 @@ class defaults { * * @return The default GDS threshold size in bytes. */ - [[nodiscard]] static std::size_t gds_threshold() { return instance()->_gds_threshold; } + [[nodiscard]] static std::size_t gds_threshold(); /** * @brief Reset the default GDS threshold, which is the minimum size to use GDS (in bytes). * @param nbytes The default GDS threshold size in bytes. */ - static void gds_threshold_reset(std::size_t nbytes) { instance()->_gds_threshold = nbytes; } + static void gds_threshold_reset(std::size_t nbytes); /** * @brief Get the size of the bounce buffer used to stage data in host memory. @@ -374,21 +250,14 @@ class defaults { * * @return The bounce buffer size in bytes. */ - [[nodiscard]] static std::size_t bounce_buffer_size() { return instance()->_bounce_buffer_size; } + [[nodiscard]] static std::size_t bounce_buffer_size(); /** * @brief Reset the size of the bounce buffer used to stage data in host memory. * * @param nbytes The bounce buffer size in bytes. */ - static void bounce_buffer_size_reset(std::size_t nbytes) - { - if (nbytes == 0) { - throw std::invalid_argument( - "size of the bounce buffer must be a positive integer greater than zero"); - } - instance()->_bounce_buffer_size = nbytes; - } + static void bounce_buffer_size_reset(std::size_t nbytes); }; } // namespace kvikio diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/posix_io.hpp index 4327a301ec..7675285d09 100644 --- a/cpp/include/kvikio/posix_io.hpp +++ b/cpp/include/kvikio/posix_io.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,31 +65,9 @@ class StreamsByThread { // cuDevicePrimaryCtxReset() or cudaDeviceReset() before program termination. ~StreamsByThread() = default; - KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id) - { - static StreamsByThread _instance; + KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id); - // If no current context, we return the null/default stream - if (ctx == nullptr) { return nullptr; } - auto key = std::make_pair(ctx, thd_id); - - // Create a new stream if `ctx` doesn't have one. - if (auto search = _instance._streams.find(key); search == _instance._streams.end()) { - CUstream stream{}; - CUDA_DRIVER_TRY(cudaAPI::instance().StreamCreate(&stream, CU_STREAM_DEFAULT)); - _instance._streams[key] = stream; - return stream; - } else { - return search->second; - } - } - - static CUstream get() - { - CUcontext ctx{nullptr}; - CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); - return get(ctx, std::this_thread::get_id()); - } + static CUstream get(); StreamsByThread(const StreamsByThread&) = delete; StreamsByThread& operator=(StreamsByThread const&) = delete; @@ -251,16 +229,11 @@ std::size_t posix_host_write(int fd, const void* buf, std::size_t size, std::siz * @param devPtr_offset Offset relative to the `devPtr_base` pointer to read into. * @return Size of bytes that were successfully read. */ -inline std::size_t posix_device_read(int fd, - const void* devPtr_base, - std::size_t size, - std::size_t file_offset, - std::size_t devPtr_offset) -{ - KVIKIO_NVTX_SCOPED_RANGE("posix_device_read()", size); - return detail::posix_device_io( - fd, devPtr_base, size, file_offset, devPtr_offset); -} +std::size_t posix_device_read(int fd, + const void* devPtr_base, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset); /** * @brief Write device memory to disk using POSIX @@ -275,15 +248,10 @@ inline std::size_t posix_device_read(int fd, * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write into. * @return Size of bytes that were successfully written. */ -inline std::size_t posix_device_write(int fd, - const void* devPtr_base, - std::size_t size, - std::size_t file_offset, - std::size_t devPtr_offset) -{ - KVIKIO_NVTX_SCOPED_RANGE("posix_device_write()", size); - return detail::posix_device_io( - fd, devPtr_base, size, file_offset, devPtr_offset); -} +std::size_t posix_device_write(int fd, + const void* devPtr_base, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset); } // namespace kvikio::detail diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index e1b152b23c..ff0741ae3f 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,101 +33,6 @@ #include namespace kvikio { -namespace detail { - -/** - * @brief Bounce buffer in pinned host memory. - * - * @note Is not thread-safe. - */ -class BounceBufferH2D { - CUstream _stream; // The CUDA stream to use. - CUdeviceptr _dev; // The output device buffer. - AllocRetain::Alloc _host_buffer; // The host buffer to bounce data on. - std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. - std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host` (resets on flush). - - public: - /** - * @brief Create a bounce buffer for an output device buffer. - * - * @param stream The CUDA stream used throughout the lifetime of the bounce buffer. - * @param device_buffer The output device buffer (final destination of the data). - */ - BounceBufferH2D(CUstream stream, void* device_buffer) - : _stream{stream}, - _dev{convert_void2deviceptr(device_buffer)}, - _host_buffer{AllocRetain::instance().get()} - { - } - - /** - * @brief The bounce buffer if flushed to device on destruction. - */ - ~BounceBufferH2D() noexcept - { - try { - flush(); - } catch (CUfileException const& e) { - std::cerr << "BounceBufferH2D error on final flush: "; - std::cerr << e.what(); - std::cerr << std::endl; - } - } - - private: - /** - * @brief Write host memory to the output device buffer. - * - * @param src The host memory source. - * @param size Number of bytes to write. - */ - void write_to_device(void const* src, std::size_t size) - { - if (size > 0) { - CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream)); - CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream)); - _dev_offset += size; - } - } - - /** - * @brief Flush the bounce buffer by writing everything to the output device buffer. - */ - void flush() - { - write_to_device(_host_buffer.get(), _host_offset); - _host_offset = 0; - } - - public: - /** - * @brief Write host memory to the bounce buffer (also host memory). - * - * Only when the bounce buffer has been filled up is data copied to the output device buffer. - * - * @param data The host memory source. - * @param size Number of bytes to write. - */ - void write(char const* data, std::size_t size) - { - if (_host_buffer.size() - _host_offset < size) { // Not enough space left in the bounce buffer - flush(); - assert(_host_offset == 0); - } - if (_host_buffer.size() < size) { - // If still not enough space, we just copy the data to the device. This only happens when - // `defaults::bounce_buffer_size()` is smaller than 16kb thus no need to performance - // optimize for this case. - write_to_device(data, size); - } else if (size > 0) { - std::memcpy(_host_buffer.get(_host_offset), data, size); - _host_offset += size; - } - } -}; - -} // namespace detail class CurlHandle; // Prototype @@ -173,9 +78,9 @@ class HttpEndpoint : public RemoteEndpoint { * * @param url The full http url to the remote file. */ - HttpEndpoint(std::string url) : _url{std::move(url)} {} + HttpEndpoint(std::string url); void setopt(CurlHandle& curl) override; - std::string str() const override { return _url; } + std::string str() const override; ~HttpEndpoint() override = default; }; @@ -203,17 +108,7 @@ class S3Endpoint : public RemoteEndpoint { */ static std::string unwrap_or_default(std::optional aws_arg, std::string const& env_var, - std::string const& err_msg = "") - { - if (aws_arg.has_value()) { return std::move(*aws_arg); } - - char const* env = std::getenv(env_var.c_str()); - if (env == nullptr) { - if (err_msg.empty()) { return std::string(); } - throw std::invalid_argument(err_msg); - } - return std::string(env); - } + std::string const& err_msg = ""); public: /** @@ -234,22 +129,7 @@ class S3Endpoint : public RemoteEndpoint { static std::string url_from_bucket_and_object(std::string const& bucket_name, std::string const& object_name, std::optional const& aws_region, - std::optional aws_endpoint_url) - { - auto const endpoint_url = unwrap_or_default(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL"); - std::stringstream ss; - if (endpoint_url.empty()) { - auto const region = - unwrap_or_default(std::move(aws_region), - "AWS_DEFAULT_REGION", - "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); - // We default to the official AWS url scheme. - ss << "https://" << bucket_name << ".s3." << region << ".amazonaws.com/" << object_name; - } else { - ss << endpoint_url << "/" << bucket_name << "/" << object_name; - } - return ss.str(); - } + std::optional aws_endpoint_url); /** * @brief Given an url like "s3:///", return the name of the bucket and object. @@ -259,14 +139,7 @@ class S3Endpoint : public RemoteEndpoint { * @param s3_url S3 url. * @return Pair of strings: [bucket-name, object-name]. */ - [[nodiscard]] static std::pair parse_s3_url(std::string const& s3_url) - { - // Regular expression to match s3:/// - std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; - std::smatch matches; - if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; } - throw std::invalid_argument("Input string does not match the expected S3 URL format."); - } + [[nodiscard]] static std::pair parse_s3_url(std::string const& s3_url); /** * @brief Create a S3 endpoint from a url. @@ -284,46 +157,7 @@ class S3Endpoint : public RemoteEndpoint { S3Endpoint(std::string url, std::optional aws_region = std::nullopt, std::optional aws_access_key = std::nullopt, - std::optional aws_secret_access_key = std::nullopt) - : _url{std::move(url)} - { - // Regular expression to match http[s]:// - std::regex pattern{R"(^https?://.*)", std::regex_constants::icase}; - if (!std::regex_search(_url, pattern)) { - throw std::invalid_argument("url must start with http:// or https://"); - } - - auto const region = - unwrap_or_default(std::move(aws_region), - "AWS_DEFAULT_REGION", - "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); - - auto const access_key = - unwrap_or_default(std::move(aws_access_key), - "AWS_ACCESS_KEY_ID", - "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set."); - - auto const secret_access_key = unwrap_or_default( - std::move(aws_secret_access_key), - "AWS_SECRET_ACCESS_KEY", - "S3: must provide `aws_secret_access_key` if AWS_SECRET_ACCESS_KEY isn't set."); - - // Create the CURLOPT_AWS_SIGV4 option - { - std::stringstream ss; - ss << "aws:amz:" << region << ":s3"; - _aws_sigv4 = ss.str(); - } - // Create the CURLOPT_USERPWD option - // Notice, curl uses `secret_access_key` to generate a AWS V4 signature. It is NOT included - // in the http header. See - // - { - std::stringstream ss; - ss << access_key << ":" << secret_access_key; - _aws_userpwd = ss.str(); - } - } + std::optional aws_secret_access_key = std::nullopt); /** * @brief Create a S3 endpoint from a bucket and object name. @@ -346,17 +180,10 @@ class S3Endpoint : public RemoteEndpoint { std::optional aws_region = std::nullopt, std::optional aws_access_key = std::nullopt, std::optional aws_secret_access_key = std::nullopt, - std::optional aws_endpoint_url = std::nullopt) - : S3Endpoint(url_from_bucket_and_object( - bucket_name, object_name, aws_region, std::move(aws_endpoint_url)), - std::move(aws_region), - std::move(aws_access_key), - std::move(aws_secret_access_key)) - { - } + std::optional aws_endpoint_url = std::nullopt); void setopt(CurlHandle& curl) override; - std::string str() const override { return _url; } + std::string str() const override; ~S3Endpoint() override = default; }; @@ -375,10 +202,7 @@ class RemoteHandle { * @param endpoint Remote endpoint used for subsequent IO. * @param nbytes The size of the remote file (in bytes). */ - RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) - : _endpoint{std::move(endpoint)}, _nbytes{nbytes} - { - } + RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes); /** * @brief Create a new remote handle from an endpoint (infers the file size). @@ -402,14 +226,14 @@ class RemoteHandle { * * @return The number of bytes. */ - [[nodiscard]] std::size_t nbytes() const noexcept { return _nbytes; } + std::size_t nbytes() const noexcept; /** * @brief Get a const reference to the underlying remote endpoint. * * @return The remote endpoint. */ - [[nodiscard]] RemoteEndpoint const& endpoint() const noexcept { return *_endpoint; } + [[nodiscard]] RemoteEndpoint const& endpoint() const noexcept; /** * @brief Read from remote source into buffer (host or device memory). diff --git a/cpp/include/kvikio/stream.hpp b/cpp/include/kvikio/stream.hpp index 9eb9942b7a..4b3e37a980 100644 --- a/cpp/include/kvikio/stream.hpp +++ b/cpp/include/kvikio/stream.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,13 +17,11 @@ #include #include -#include -#include -#include -#include #include #include +#include + namespace kvikio { /** @@ -63,38 +61,15 @@ class StreamFuture { StreamFuture() noexcept = default; StreamFuture( - void* devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream) - : _devPtr_base{devPtr_base}, _stream{stream} - { - // Notice, we allocate the arguments using malloc() as specified in the cuFile docs: - // - if ((_val = static_cast(std::malloc(sizeof(ArgByVal)))) == nullptr) { - throw std::bad_alloc{}; - } - *_val = { - .size = size, .file_offset = file_offset, .devPtr_offset = devPtr_offset, .bytes_done = 0}; - } + void* devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream); /** * @brief StreamFuture support move semantic but isn't copyable */ StreamFuture(const StreamFuture&) = delete; StreamFuture& operator=(StreamFuture& o) = delete; - StreamFuture(StreamFuture&& o) noexcept - : _devPtr_base{std::exchange(o._devPtr_base, nullptr)}, - _stream{std::exchange(o._stream, nullptr)}, - _val{std::exchange(o._val, nullptr)}, - _stream_synchronized{o._stream_synchronized} - { - } - StreamFuture& operator=(StreamFuture&& o) noexcept - { - _devPtr_base = std::exchange(o._devPtr_base, nullptr); - _stream = std::exchange(o._stream, nullptr); - _val = std::exchange(o._val, nullptr); - _stream_synchronized = o._stream_synchronized; - return *this; - } + StreamFuture(StreamFuture&& o) noexcept; + StreamFuture& operator=(StreamFuture&& o) noexcept; /** * @brief Return the arguments of the future call @@ -102,18 +77,7 @@ class StreamFuture { * @return Tuple of the arguments in the order matching `FileHandle.read()` and * `FileHandle.write()` */ - std::tuple get_args() const - { - if (_val == nullptr) { - throw kvikio::CUfileException("cannot get arguments from an uninitialized StreamFuture"); - } - return {_devPtr_base, - &_val->size, - &_val->file_offset, - &_val->devPtr_offset, - &_val->bytes_done, - _stream}; - } + std::tuple get_args() const; /** * @brief Return the number of bytes read or written by the future operation. @@ -122,38 +86,13 @@ class StreamFuture { * * @return Number of bytes read or written by the future operation. */ - std::size_t check_bytes_done() - { - if (_val == nullptr) { - throw kvikio::CUfileException("cannot check bytes done on an uninitialized StreamFuture"); - } - - if (!_stream_synchronized) { - _stream_synchronized = true; - CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream)); - } - - CUFILE_CHECK_BYTES_DONE(_val->bytes_done); - // At this point, we know `_val->bytes_done` is a positive value otherwise - // CUFILE_CHECK_BYTES_DONE() would have raised an exception. - return static_cast(_val->bytes_done); - } + std::size_t check_bytes_done(); /** * @brief Free the by-value arguments and make sure the associated CUDA stream has been * synchronized. */ - ~StreamFuture() noexcept - { - if (_val != nullptr) { - try { - check_bytes_done(); - } catch (const kvikio::CUfileException& e) { - std::cerr << e.what() << std::endl; - } - std::free(_val); - } - } + ~StreamFuture() noexcept; }; } // namespace kvikio diff --git a/cpp/src/batch.cpp b/cpp/src/batch.cpp new file mode 100644 index 0000000000..8ced70cbd8 --- /dev/null +++ b/cpp/src/batch.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace kvikio { + +#ifdef KVIKIO_CUFILE_BATCH_API_FOUND + +BatchHandle::BatchHandle(int max_num_events) : _initialized{true}, _max_num_events{max_num_events} +{ + CUFILE_TRY(cuFileAPI::instance().BatchIOSetUp(&_handle, max_num_events)); +} + +BatchHandle::BatchHandle(BatchHandle&& o) noexcept + : _initialized{std::exchange(o._initialized, false)}, + _max_num_events{std::exchange(o._max_num_events, 0)} +{ + _handle = std::exchange(o._handle, CUfileBatchHandle_t{}); +} + +BatchHandle::~BatchHandle() noexcept { close(); } + +bool BatchHandle::closed() const noexcept { return !_initialized; } + +void BatchHandle::close() noexcept +{ + if (closed()) { return; } + _initialized = false; + + cuFileAPI::instance().BatchIODestroy(_handle); +} + +void BatchHandle::submit(const std::vector& operations) +{ + if (convert_size2ssize(operations.size()) > _max_num_events) { + throw CUfileException("Cannot submit more than the max_num_events)"); + } + std::vector io_batch_params; + io_batch_params.reserve(operations.size()); + for (const auto& op : operations) { + if (op.file_handle.is_compat_mode_preferred()) { + throw CUfileException("Cannot submit a FileHandle opened in compatibility mode"); + } + + io_batch_params.push_back(CUfileIOParams_t{.mode = CUFILE_BATCH, + .u = {.batch = {.devPtr_base = op.devPtr_base, + .file_offset = op.file_offset, + .devPtr_offset = op.devPtr_offset, + .size = op.size}}, + .fh = op.file_handle.handle(), + .opcode = op.opcode, + .cookie = nullptr}); + } + + CUFILE_TRY(cuFileAPI::instance().BatchIOSubmit( + _handle, io_batch_params.size(), io_batch_params.data(), 0)); +} + +std::vector BatchHandle::status(unsigned min_nr, + unsigned max_nr, + struct timespec* timeout) +{ + std::vector ret; + ret.resize(_max_num_events); + CUFILE_TRY(cuFileAPI::instance().BatchIOGetStatus(_handle, min_nr, &max_nr, &ret[0], timeout)); + ret.resize(max_nr); + return ret; +} + +void BatchHandle::cancel() { CUFILE_TRY(cuFileAPI::instance().BatchIOCancel(_handle)); } + +#else + +BatchHandle::BatchHandle(int max_num_events) +{ + throw CUfileException("BatchHandle requires cuFile's batch API, please build with CUDA v12.1+"); +} + +bool BatchHandle::closed() const noexcept { return true; } + +void BatchHandle::close() noexcept {} + +void BatchHandle::submit(const std::vector& operations) {} + +std::vector BatchHandle::status(unsigned min_nr, + unsigned max_nr, + struct timespec* timeout) +{ + return std::vector{}; +} + +void BatchHandle::cancel() {} + +#endif + +} // namespace kvikio diff --git a/cpp/src/bounce_buffer.cpp b/cpp/src/bounce_buffer.cpp new file mode 100644 index 0000000000..65ca1aaa52 --- /dev/null +++ b/cpp/src/bounce_buffer.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +namespace kvikio { + +AllocRetain::Alloc::Alloc(AllocRetain* manager, void* alloc, std::size_t size) + : _manager(manager), _alloc{alloc}, _size{size} +{ +} + +AllocRetain::Alloc::~Alloc() noexcept { _manager->put(_alloc, _size); } + +void* AllocRetain::Alloc::get() noexcept { return _alloc; } + +void* AllocRetain::Alloc::get(std::ptrdiff_t offset) noexcept +{ + return static_cast(_alloc) + offset; +} + +std::size_t AllocRetain::Alloc::size() noexcept { return _size; } + +std::size_t AllocRetain::_clear() +{ + std::size_t ret = _free_allocs.size() * _size; + while (!_free_allocs.empty()) { + CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(_free_allocs.top())); + _free_allocs.pop(); + } + return ret; +} + +void AllocRetain::_ensure_alloc_size() +{ + auto const bounce_buffer_size = defaults::bounce_buffer_size(); + if (_size != bounce_buffer_size) { + _clear(); + _size = bounce_buffer_size; + } +} + +AllocRetain::Alloc AllocRetain::get() +{ + std::lock_guard const lock(_mutex); + _ensure_alloc_size(); + + // Check if we have an allocation available + if (!_free_allocs.empty()) { + void* ret = _free_allocs.top(); + _free_allocs.pop(); + return Alloc(this, ret, _size); + } + + // If no available allocation, allocate and register a new one + void* alloc{}; + // Allocate page-locked host memory + CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&alloc, _size, CU_MEMHOSTREGISTER_PORTABLE)); + return Alloc(this, alloc, _size); +} + +void AllocRetain::put(void* alloc, std::size_t size) +{ + std::lock_guard const lock(_mutex); + _ensure_alloc_size(); + + // If the size of `alloc` matches the sizes of the retained allocations, + // it is added to the set of free allocation otherwise it is freed. + if (size == _size) { + _free_allocs.push(alloc); + } else { + CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(alloc)); + } +} + +std::size_t AllocRetain::clear() +{ + std::lock_guard const lock(_mutex); + return _clear(); +} + +AllocRetain& AllocRetain::instance() +{ + static AllocRetain _instance; + return _instance; +} + +} // namespace kvikio diff --git a/cpp/src/buffer.cpp b/cpp/src/buffer.cpp new file mode 100644 index 0000000000..0aa772d50f --- /dev/null +++ b/cpp/src/buffer.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +void buffer_register(const void* devPtr_base, + std::size_t size, + int flags, + const std::vector& errors_to_ignore) +{ + if (defaults::is_compat_mode_preferred()) { return; } + CUfileError_t status = cuFileAPI::instance().BufRegister(devPtr_base, size, flags); + if (status.err != CU_FILE_SUCCESS) { + // Check if `status.err` is in `errors_to_ignore` + if (std::find(errors_to_ignore.begin(), errors_to_ignore.end(), status.err) == + errors_to_ignore.end()) { + CUFILE_TRY(status); + } + } +} + +void buffer_deregister(const void* devPtr_base) +{ + if (defaults::is_compat_mode_preferred()) { return; } + CUFILE_TRY(cuFileAPI::instance().BufDeregister(devPtr_base)); +} + +void memory_register(const void* devPtr, int flags, const std::vector& errors_to_ignore) +{ + auto [base, nbytes, offset] = get_alloc_info(devPtr); + buffer_register(base, nbytes, flags, errors_to_ignore); +} + +void memory_deregister(const void* devPtr) +{ + auto [base, nbytes, offset] = get_alloc_info(devPtr); + buffer_deregister(base); +} + +} // namespace kvikio diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp index 7566c11532..2abbf33e92 100644 --- a/cpp/src/cufile/config.cpp +++ b/cpp/src/cufile/config.cpp @@ -21,7 +21,7 @@ #include namespace kvikio { -namespace detail { +namespace { [[nodiscard]] inline const char* lookup_config_path() { @@ -31,11 +31,11 @@ namespace detail { return ""; } -} // namespace detail +} // namespace const std::string& config_path() { - static const std::string ret = detail::lookup_config_path(); + static const std::string ret = lookup_config_path(); return ret; } diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp index 127050ed06..13a23f547c 100644 --- a/cpp/src/cufile/driver.cpp +++ b/cpp/src/cufile/driver.cpp @@ -23,7 +23,7 @@ #include namespace kvikio { -namespace detail { +namespace { [[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept { @@ -38,7 +38,7 @@ inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noe prop &= ~(1U << flag); } } -} // namespace detail +} // namespace #ifdef KVIKIO_CUFILE_FOUND @@ -70,31 +70,31 @@ bool DriverProperties::is_gds_available() return !(get_nvfs_major_version() == 0 && get_nvfs_minor_version() == 0); } -[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version() +unsigned int DriverProperties::get_nvfs_major_version() { lazy_init(); return _props.nvfs.major_version; } -[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version() +unsigned int DriverProperties::get_nvfs_minor_version() { lazy_init(); return _props.nvfs.minor_version; } -[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode() +bool DriverProperties::get_nvfs_allow_compat_mode() { lazy_init(); - return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE); + return get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE); } -[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode() +bool DriverProperties::get_nvfs_poll_mode() { lazy_init(); - return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE); + return get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE); } -[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size() +std::size_t DriverProperties::get_nvfs_poll_thresh_size() { lazy_init(); return _props.nvfs.poll_thresh_size; @@ -104,7 +104,7 @@ void DriverProperties::set_nvfs_poll_mode(bool enable) { lazy_init(); CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(enable, get_nvfs_poll_thresh_size())); - detail::set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable); + set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable); } void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb) @@ -114,20 +114,20 @@ void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb) _props.nvfs.poll_thresh_size = size_in_kb; } -[[nodiscard]] std::vector DriverProperties::get_nvfs_statusflags() +std::vector DriverProperties::get_nvfs_statusflags() { lazy_init(); std::vector ret; - if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) { + if (get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) { ret.push_back(CU_FILE_USE_POLL_MODE); } - if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) { + if (get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) { ret.push_back(CU_FILE_ALLOW_COMPAT_MODE); } return ret; } -[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size() +std::size_t DriverProperties::get_max_device_cache_size() { lazy_init(); return _props.max_device_cache_size; @@ -140,13 +140,13 @@ void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb) _props.max_device_cache_size = size_in_kb; } -[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size() +std::size_t DriverProperties::get_per_buffer_cache_size() { lazy_init(); return _props.per_buffer_cache_size; } -[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size() +std::size_t DriverProperties::get_max_pinned_memory_size() { lazy_init(); return _props.max_device_pinned_mem_size; @@ -159,7 +159,7 @@ void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb) _props.max_device_pinned_mem_size = size_in_kb; } -[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size() +std::size_t DriverProperties::get_max_batch_io_size() { #ifdef KVIKIO_CUFILE_BATCH_API_FOUND lazy_init(); @@ -176,27 +176,27 @@ DriverProperties::DriverProperties() {} bool DriverProperties::is_gds_available() { return false; } -[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version() +unsigned int DriverProperties::get_nvfs_major_version() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version() +unsigned int DriverProperties::get_nvfs_minor_version() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode() +bool DriverProperties::get_nvfs_allow_compat_mode() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode() +bool DriverProperties::get_nvfs_poll_mode() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size() +std::size_t DriverProperties::get_nvfs_poll_thresh_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } @@ -211,12 +211,12 @@ void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb) throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] std::vector DriverProperties::get_nvfs_statusflags() +std::vector DriverProperties::get_nvfs_statusflags() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size() +std::size_t DriverProperties::get_max_device_cache_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } @@ -226,12 +226,12 @@ void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb) throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size() +std::size_t DriverProperties::get_per_buffer_cache_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size() +std::size_t DriverProperties::get_max_pinned_memory_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } @@ -241,7 +241,7 @@ void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb) throw CUfileException("KvikIO not compiled with cuFile.h"); } -[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size() +std::size_t DriverProperties::get_max_batch_io_size() { throw CUfileException("KvikIO not compiled with cuFile.h"); } diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp new file mode 100644 index 0000000000..f249a8b361 --- /dev/null +++ b/cpp/src/defaults.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace kvikio { + +namespace detail { +CompatMode parse_compat_mode_str(std::string_view compat_mode_str) +{ + // Convert to lowercase + std::string tmp{compat_mode_str}; + std::transform( + tmp.begin(), tmp.end(), tmp.begin(), [](unsigned char c) { return std::tolower(c); }); + + CompatMode res{}; + if (tmp == "on" || tmp == "true" || tmp == "yes" || tmp == "1") { + res = CompatMode::ON; + } else if (tmp == "off" || tmp == "false" || tmp == "no" || tmp == "0") { + res = CompatMode::OFF; + } else if (tmp == "auto") { + res = CompatMode::AUTO; + } else { + throw std::invalid_argument("Unknown compatibility mode: " + std::string{tmp}); + } + return res; +} + +template <> +inline bool getenv_or(std::string_view env_var_name, bool default_val) +{ + const auto* env_val = std::getenv(env_var_name.data()); + if (env_val == nullptr) { return default_val; } + try { + // Try parsing `env_var_name` as a integer + return static_cast(std::stoi(env_val)); + } catch (const std::invalid_argument&) { + } + // Convert to lowercase + std::string str{env_val}; + // Special considerations regarding the case conversion: + // - std::tolower() is not an addressable function. Passing it to std::transform() as + // a function pointer, if the compile turns out successful, causes the program behavior + // "unspecified (possibly ill-formed)", hence the lambda. ::tolower() is addressable + // and does not have this problem, but the following item still applies. + // - To avoid UB in std::tolower() or ::tolower(), the character must be cast to unsigned char. + std::transform( + str.begin(), str.end(), str.begin(), [](unsigned char c) { return std::tolower(c); }); + // Trim whitespaces + std::stringstream trimmer; + trimmer << str; + str.clear(); + trimmer >> str; + // Match value + if (str == "true" || str == "on" || str == "yes") { return true; } + if (str == "false" || str == "off" || str == "no") { return false; } + throw std::invalid_argument("unknown config value " + std::string{env_var_name} + "=" + + std::string{env_val}); +} + +template <> +inline CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val) +{ + auto* env_val = std::getenv(env_var_name.data()); + if (env_val == nullptr) { return default_val; } + return parse_compat_mode_str(env_val); +} + +} // namespace detail + +unsigned int defaults::get_num_threads_from_env() +{ + const int ret = detail::getenv_or("KVIKIO_NTHREADS", 1); + if (ret <= 0) { + throw std::invalid_argument("KVIKIO_NTHREADS has to be a positive integer greater than zero"); + } + return ret; +} + +defaults::defaults() +{ + // Determine the default value of `compat_mode` + { + _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO); + } + // Determine the default value of `task_size` + { + const ssize_t env = detail::getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024); + if (env <= 0) { + throw std::invalid_argument( + "KVIKIO_TASK_SIZE has to be a positive integer greater than zero"); + } + _task_size = env; + } + // Determine the default value of `gds_threshold` + { + const ssize_t env = detail::getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024); + if (env < 0) { + throw std::invalid_argument("KVIKIO_GDS_THRESHOLD has to be a positive integer"); + } + _gds_threshold = env; + } + // Determine the default value of `bounce_buffer_size` + { + const ssize_t env = detail::getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024); + if (env <= 0) { + throw std::invalid_argument( + "KVIKIO_BOUNCE_BUFFER_SIZE has to be a positive integer greater than zero"); + } + _bounce_buffer_size = env; + } +} + +defaults* defaults::instance() +{ + static defaults _instance; + return &_instance; +} +CompatMode defaults::compat_mode() { return instance()->_compat_mode; } + +void defaults::compat_mode_reset(CompatMode compat_mode) { instance()->_compat_mode = compat_mode; } + +CompatMode defaults::infer_compat_mode_if_auto(CompatMode compat_mode) +{ + if (compat_mode == CompatMode::AUTO) { + static auto inferred_compat_mode_for_auto = []() -> CompatMode { + return is_cufile_available() ? CompatMode::OFF : CompatMode::ON; + }(); + return inferred_compat_mode_for_auto; + } + return compat_mode; +} + +bool defaults::is_compat_mode_preferred(CompatMode compat_mode) +{ + return compat_mode == CompatMode::ON || + (compat_mode == CompatMode::AUTO && + defaults::infer_compat_mode_if_auto(compat_mode) == CompatMode::ON); +} + +bool defaults::is_compat_mode_preferred() { return is_compat_mode_preferred(compat_mode()); } + +BS::thread_pool& defaults::thread_pool() { return instance()->_thread_pool; } + +unsigned int defaults::thread_pool_nthreads() { return thread_pool().get_thread_count(); } + +void defaults::thread_pool_nthreads_reset(unsigned int nthreads) +{ + if (nthreads == 0) { + throw std::invalid_argument("number of threads must be a positive integer greater than zero"); + } + thread_pool().reset(nthreads); +} + +std::size_t defaults::task_size() { return instance()->_task_size; } + +void defaults::task_size_reset(std::size_t nbytes) +{ + if (nbytes == 0) { + throw std::invalid_argument("task size must be a positive integer greater than zero"); + } + instance()->_task_size = nbytes; +} + +std::size_t defaults::gds_threshold() { return instance()->_gds_threshold; } + +void defaults::gds_threshold_reset(std::size_t nbytes) { instance()->_gds_threshold = nbytes; } + +std::size_t defaults::bounce_buffer_size() { return instance()->_bounce_buffer_size; } + +void defaults::bounce_buffer_size_reset(std::size_t nbytes) +{ + if (nbytes == 0) { + throw std::invalid_argument( + "size of the bounce buffer must be a positive integer greater than zero"); + } + instance()->_bounce_buffer_size = nbytes; +} + +} // namespace kvikio diff --git a/cpp/src/error.cpp b/cpp/src/error.cpp new file mode 100644 index 0000000000..21ce736a65 --- /dev/null +++ b/cpp/src/error.cpp @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index f2b7fa15db..37e0f7729b 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -163,9 +163,9 @@ FileHandle::FileHandle(const std::string& file_path, } } -[[nodiscard]] int FileHandle::fd_open_flags() const { return open_flags(_fd_direct_off); } +int FileHandle::fd_open_flags() const { return open_flags(_fd_direct_off); } -[[nodiscard]] std::size_t FileHandle::nbytes() const +std::size_t FileHandle::nbytes() const { if (closed()) { return 0; } if (_nbytes == 0) { _nbytes = get_file_size(_fd_direct_off); } diff --git a/cpp/src/posix_io.cpp b/cpp/src/posix_io.cpp new file mode 100644 index 0000000000..d4ee2944e5 --- /dev/null +++ b/cpp/src/posix_io.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace kvikio::detail { + +CUstream StreamsByThread::get(CUcontext ctx, std::thread::id thd_id) +{ + static StreamsByThread _instance; + + // If no current context, we return the null/default stream + if (ctx == nullptr) { return nullptr; } + auto key = std::make_pair(ctx, thd_id); + + // Create a new stream if `ctx` doesn't have one. + if (auto search = _instance._streams.find(key); search == _instance._streams.end()) { + CUstream stream{}; + CUDA_DRIVER_TRY(cudaAPI::instance().StreamCreate(&stream, CU_STREAM_DEFAULT)); + _instance._streams[key] = stream; + return stream; + } else { + return search->second; + } +} + +CUstream StreamsByThread::get() +{ + CUcontext ctx{nullptr}; + CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); + return get(ctx, std::this_thread::get_id()); +} + +std::size_t posix_device_read(int fd, + const void* devPtr_base, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset) +{ + KVIKIO_NVTX_SCOPED_RANGE("posix_device_read()", size); + return detail::posix_device_io( + fd, devPtr_base, size, file_offset, devPtr_offset); +} + +std::size_t posix_device_write(int fd, + const void* devPtr_base, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset) +{ + KVIKIO_NVTX_SCOPED_RANGE("posix_device_write()", size); + return detail::posix_device_io( + fd, devPtr_base, size, file_offset, devPtr_offset); +} + +} // namespace kvikio::detail diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index adcf56befc..9fd0690891 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,106 @@ namespace kvikio { +namespace { + +/** + * @brief Bounce buffer in pinned host memory. + * + * @note Is not thread-safe. + */ +class BounceBufferH2D { + CUstream _stream; // The CUDA stream to use. + CUdeviceptr _dev; // The output device buffer. + AllocRetain::Alloc _host_buffer; // The host buffer to bounce data on. + std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. + std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host` (resets on flush). + + public: + /** + * @brief Create a bounce buffer for an output device buffer. + * + * @param stream The CUDA stream used throughout the lifetime of the bounce buffer. + * @param device_buffer The output device buffer (final destination of the data). + */ + BounceBufferH2D(CUstream stream, void* device_buffer) + : _stream{stream}, + _dev{convert_void2deviceptr(device_buffer)}, + _host_buffer{AllocRetain::instance().get()} + { + } + + /** + * @brief The bounce buffer if flushed to device on destruction. + */ + ~BounceBufferH2D() noexcept + { + try { + flush(); + } catch (CUfileException const& e) { + std::cerr << "BounceBufferH2D error on final flush: "; + std::cerr << e.what(); + std::cerr << std::endl; + } + } + + private: + /** + * @brief Write host memory to the output device buffer. + * + * @param src The host memory source. + * @param size Number of bytes to write. + */ + void write_to_device(void const* src, std::size_t size) + { + if (size > 0) { + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream)); + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream)); + _dev_offset += size; + } + } + + /** + * @brief Flush the bounce buffer by writing everything to the output device buffer. + */ + void flush() + { + write_to_device(_host_buffer.get(), _host_offset); + _host_offset = 0; + } + + public: + /** + * @brief Write host memory to the bounce buffer (also host memory). + * + * Only when the bounce buffer has been filled up is data copied to the output device buffer. + * + * @param data The host memory source. + * @param size Number of bytes to write. + */ + void write(char const* data, std::size_t size) + { + if (_host_buffer.size() - _host_offset < size) { // Not enough space left in the bounce buffer + flush(); + assert(_host_offset == 0); + } + if (_host_buffer.size() < size) { + // If still not enough space, we just copy the data to the device. This only happens when + // `defaults::bounce_buffer_size()` is smaller than 16kb thus no need to performance + // optimize for this case. + write_to_device(data, size); + } else if (size > 0) { + std::memcpy(_host_buffer.get(_host_offset), data, size); + _host_offset += size; + } + } +}; + +} // namespace + +HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} + +std::string HttpEndpoint::str() const { return _url; } + void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } void S3Endpoint::setopt(CurlHandle& curl) @@ -42,6 +142,114 @@ void S3Endpoint::setopt(CurlHandle& curl) curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str()); } +std::string S3Endpoint::unwrap_or_default(std::optional aws_arg, + std::string const& env_var, + std::string const& err_msg) +{ + if (aws_arg.has_value()) { return std::move(*aws_arg); } + + char const* env = std::getenv(env_var.c_str()); + if (env == nullptr) { + if (err_msg.empty()) { return std::string(); } + throw std::invalid_argument(err_msg); + } + return std::string(env); +} + +std::string S3Endpoint::url_from_bucket_and_object(std::string const& bucket_name, + std::string const& object_name, + std::optional const& aws_region, + std::optional aws_endpoint_url) +{ + auto const endpoint_url = unwrap_or_default(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL"); + std::stringstream ss; + if (endpoint_url.empty()) { + auto const region = + unwrap_or_default(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + // We default to the official AWS url scheme. + ss << "https://" << bucket_name << ".s3." << region << ".amazonaws.com/" << object_name; + } else { + ss << endpoint_url << "/" << bucket_name << "/" << object_name; + } + return ss.str(); +} + +std::pair S3Endpoint::parse_s3_url(std::string const& s3_url) +{ + // Regular expression to match s3:/// + std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; + std::smatch matches; + if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; } + throw std::invalid_argument("Input string does not match the expected S3 URL format."); +} + +S3Endpoint::S3Endpoint(std::string url, + std::optional aws_region, + std::optional aws_access_key, + std::optional aws_secret_access_key) + : _url{std::move(url)} +{ + // Regular expression to match http[s]:// + std::regex pattern{R"(^https?://.*)", std::regex_constants::icase}; + if (!std::regex_search(_url, pattern)) { + throw std::invalid_argument("url must start with http:// or https://"); + } + + auto const region = + unwrap_or_default(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + + auto const access_key = + unwrap_or_default(std::move(aws_access_key), + "AWS_ACCESS_KEY_ID", + "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set."); + + auto const secret_access_key = unwrap_or_default( + std::move(aws_secret_access_key), + "AWS_SECRET_ACCESS_KEY", + "S3: must provide `aws_secret_access_key` if AWS_SECRET_ACCESS_KEY isn't set."); + + // Create the CURLOPT_AWS_SIGV4 option + { + std::stringstream ss; + ss << "aws:amz:" << region << ":s3"; + _aws_sigv4 = ss.str(); + } + // Create the CURLOPT_USERPWD option + // Notice, curl uses `secret_access_key` to generate a AWS V4 signature. It is NOT included + // in the http header. See + // + { + std::stringstream ss; + ss << access_key << ":" << secret_access_key; + _aws_userpwd = ss.str(); + } +} + +S3Endpoint::S3Endpoint(std::string const& bucket_name, + std::string const& object_name, + std::optional aws_region, + std::optional aws_access_key, + std::optional aws_secret_access_key, + std::optional aws_endpoint_url) + : S3Endpoint( + url_from_bucket_and_object(bucket_name, object_name, aws_region, std::move(aws_endpoint_url)), + std::move(aws_region), + std::move(aws_access_key), + std::move(aws_secret_access_key)) +{ +} + +std::string S3Endpoint::str() const { return _url; } + +RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) + : _endpoint{std::move(endpoint)}, _nbytes{nbytes} +{ +} + RemoteHandle::RemoteHandle(std::unique_ptr endpoint) { auto curl = create_curl_handle(); @@ -60,6 +268,10 @@ RemoteHandle::RemoteHandle(std::unique_ptr endpoint) _endpoint = std::move(endpoint); } +std::size_t RemoteHandle::nbytes() const noexcept { return _nbytes; } + +RemoteEndpoint const& RemoteHandle::endpoint() const noexcept { return *_endpoint; } + namespace { /** @@ -74,7 +286,7 @@ struct CallbackContext { : buf{static_cast(buf)}, size{size}, offset{0}, overflow_error{0} { } - detail::BounceBufferH2D* bounce_buffer{nullptr}; // Only used by callback_device_memory + BounceBufferH2D* bounce_buffer{nullptr}; // Only used by callback_device_memory }; /** @@ -166,7 +378,7 @@ std::size_t RemoteHandle::read(void* buf, std::size_t size, std::size_t file_off PushAndPopContext c(get_context_from_pointer(buf)); // We use a bounce buffer to avoid many small memory copies to device. Libcurl has a // maximum chunk size of 16kb (`CURL_MAX_WRITE_SIZE`) but chunks are often much smaller. - detail::BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf); + BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf); ctx.bounce_buffer = &bounce_buffer; curl.perform(); } diff --git a/cpp/src/shim/utils.cpp b/cpp/src/shim/utils.cpp index ab9afbf648..314fb5382a 100644 --- a/cpp/src/shim/utils.cpp +++ b/cpp/src/shim/utils.cpp @@ -45,7 +45,7 @@ void* load_library(const std::vector& names, int mode) throw std::runtime_error("cannot open shared object file, tried: " + ss.str()); } -[[nodiscard]] bool is_running_in_wsl() +bool is_running_in_wsl() { struct utsname buf {}; int err = ::uname(&buf); @@ -57,7 +57,7 @@ void* load_library(const std::vector& names, int mode) return false; } -[[nodiscard]] bool run_udev_readable() +bool run_udev_readable() { try { return std::filesystem::is_directory("/run/udev"); diff --git a/cpp/src/stream.cpp b/cpp/src/stream.cpp new file mode 100644 index 0000000000..ac0f8138e4 --- /dev/null +++ b/cpp/src/stream.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace kvikio { + +StreamFuture::StreamFuture( + void* devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream) + : _devPtr_base{devPtr_base}, _stream{stream} +{ + // Notice, we allocate the arguments using malloc() as specified in the cuFile docs: + // + if ((_val = static_cast(std::malloc(sizeof(ArgByVal)))) == nullptr) { + throw std::bad_alloc{}; + } + *_val = { + .size = size, .file_offset = file_offset, .devPtr_offset = devPtr_offset, .bytes_done = 0}; +} + +StreamFuture::StreamFuture(StreamFuture&& o) noexcept + : _devPtr_base{std::exchange(o._devPtr_base, nullptr)}, + _stream{std::exchange(o._stream, nullptr)}, + _val{std::exchange(o._val, nullptr)}, + _stream_synchronized{o._stream_synchronized} +{ +} + +StreamFuture& StreamFuture::operator=(StreamFuture&& o) noexcept +{ + _devPtr_base = std::exchange(o._devPtr_base, nullptr); + _stream = std::exchange(o._stream, nullptr); + _val = std::exchange(o._val, nullptr); + _stream_synchronized = o._stream_synchronized; + return *this; +} + +std::tuple StreamFuture::get_args() const +{ + if (_val == nullptr) { + throw kvikio::CUfileException("cannot get arguments from an uninitialized StreamFuture"); + } + return {_devPtr_base, + &_val->size, + &_val->file_offset, + &_val->devPtr_offset, + &_val->bytes_done, + _stream}; +} + +std::size_t StreamFuture::check_bytes_done() +{ + if (_val == nullptr) { + throw kvikio::CUfileException("cannot check bytes done on an uninitialized StreamFuture"); + } + + if (!_stream_synchronized) { + _stream_synchronized = true; + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream)); + } + + CUFILE_CHECK_BYTES_DONE(_val->bytes_done); + // At this point, we know `_val->bytes_done` is a positive value otherwise + // CUFILE_CHECK_BYTES_DONE() would have raised an exception. + return static_cast(_val->bytes_done); +} + +StreamFuture::~StreamFuture() noexcept +{ + if (_val != nullptr) { + try { + check_bytes_done(); + } catch (const kvikio::CUfileException& e) { + std::cerr << e.what() << std::endl; + } + std::free(_val); + } +} + +} // namespace kvikio diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index 32834cf3a4..4ba5a757a2 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#pragma once #include #include @@ -163,9 +162,7 @@ PushAndPopContext::~PushAndPopContext() } } -// Find the base and offset of the memory allocation `devPtr` is in -std::tuple get_alloc_info(const void* devPtr, - CUcontext* ctx = nullptr) +std::tuple get_alloc_info(const void* devPtr, CUcontext* ctx) { auto dev = convert_void2deviceptr(devPtr); CUdeviceptr base_ptr{}; From 421c46d5a49dbe8fb8aba9385cdd649faba0dec2 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Jan 2025 09:16:53 -0500 Subject: [PATCH 5/8] Minor fixes --- cpp/include/kvikio/defaults.hpp | 2 +- cpp/include/kvikio/remote_handle.hpp | 2 +- cpp/include/kvikio/shim/cuda.hpp | 1 - cpp/include/kvikio/utils.hpp | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index 4c87724445..c65b1fa0bf 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index ff0741ae3f..17db472825 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -226,7 +226,7 @@ class RemoteHandle { * * @return The number of bytes. */ - std::size_t nbytes() const noexcept; + [[nodiscard]] std::size_t nbytes() const noexcept; /** * @brief Get a const reference to the underlying remote endpoint. diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp index f868d40f58..9aaac08827 100644 --- a/cpp/include/kvikio/shim/cuda.hpp +++ b/cpp/include/kvikio/shim/cuda.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 5593c6c9de..09d10cbcae 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -150,7 +150,7 @@ std::tuple get_alloc_info(const void* devPtr, CUcontext* ctx = nullptr); template -inline bool is_future_done(const T& future) +bool is_future_done(const T& future) { return future.wait_for(std::chrono::seconds(0)) != std::future_status::timeout; } From 07ad349b66012766cbc8d961503f5f1110886cea Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Jan 2025 10:29:27 -0500 Subject: [PATCH 6/8] Fix the preamble date --- cpp/include/kvikio/cufile/config.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/cufile/config.hpp b/cpp/include/kvikio/cufile/config.hpp index 7dd9ee7bcb..5d48106c9b 100644 --- a/cpp/include/kvikio/cufile/config.hpp +++ b/cpp/include/kvikio/cufile/config.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 688ceed59e9f93445b58cb776fc86aeba623a6b4 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 9 Jan 2025 12:17:20 -0500 Subject: [PATCH 7/8] Remove inline. Move getenv_or outside the detail namespace. Add noexcept to dtor --- cpp/include/kvikio/cufile/driver.hpp | 2 +- cpp/include/kvikio/defaults.hpp | 4 ++-- cpp/include/kvikio/utils.hpp | 2 +- cpp/src/cufile/config.cpp | 2 +- cpp/src/cufile/driver.cpp | 6 +++--- cpp/src/defaults.cpp | 20 ++++++++++---------- cpp/src/remote_handle.cpp | 10 ++-------- 7 files changed, 20 insertions(+), 26 deletions(-) diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp index 4a33289bda..56a6e8159b 100644 --- a/cpp/include/kvikio/cufile/driver.hpp +++ b/cpp/include/kvikio/cufile/driver.hpp @@ -34,7 +34,7 @@ class DriverInitializer { DriverInitializer(DriverInitializer&&) noexcept = delete; DriverInitializer& operator=(DriverInitializer&&) noexcept = delete; - ~DriverInitializer(); + ~DriverInitializer() noexcept; }; class DriverProperties { diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index c65b1fa0bf..ad8b5a3e40 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -57,6 +57,8 @@ namespace detail { */ CompatMode parse_compat_mode_str(std::string_view compat_mode_str); +} // namespace detail + template T getenv_or(std::string_view env_var_name, T default_val) { @@ -79,8 +81,6 @@ bool getenv_or(std::string_view env_var_name, bool default_val); template <> CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val); -} // namespace detail - /** * @brief Singleton class of default values used throughout KvikIO. * diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 09d10cbcae..5bea6f17c7 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -93,7 +93,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; } * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices * @return Primary CUDA context */ -[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal); +[[nodiscard]] KVIKIO_EXPORT CUcontext get_primary_cuda_context(int ordinal); /** * @brief Return the CUDA context associated the given device pointer, if any. diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp index 2abbf33e92..b27475b8da 100644 --- a/cpp/src/cufile/config.cpp +++ b/cpp/src/cufile/config.cpp @@ -23,7 +23,7 @@ namespace kvikio { namespace { -[[nodiscard]] inline const char* lookup_config_path() +[[nodiscard]] const char* lookup_config_path() { const char* env = std::getenv("CUFILE_ENV_PATH_JSON"); if (env != nullptr && std::filesystem::exists(env)) { return env; } diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp index 13a23f547c..ffee213c00 100644 --- a/cpp/src/cufile/driver.cpp +++ b/cpp/src/cufile/driver.cpp @@ -25,12 +25,12 @@ namespace kvikio { namespace { -[[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept +[[nodiscard]] bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept { return (prop & (1U << flag)) != 0; } -inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept +void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept { if (val) { prop |= (1U << flag); @@ -44,7 +44,7 @@ inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noe DriverInitializer::DriverInitializer() { cuFileAPI::instance().driver_open(); } -DriverInitializer::~DriverInitializer() +DriverInitializer::~DriverInitializer() noexcept { try { cuFileAPI::instance().driver_close(); diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp index f249a8b361..affcac6be3 100644 --- a/cpp/src/defaults.cpp +++ b/cpp/src/defaults.cpp @@ -49,8 +49,10 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str) return res; } +} // namespace detail + template <> -inline bool getenv_or(std::string_view env_var_name, bool default_val) +bool getenv_or(std::string_view env_var_name, bool default_val) { const auto* env_val = std::getenv(env_var_name.data()); if (env_val == nullptr) { return default_val; } @@ -82,18 +84,16 @@ inline bool getenv_or(std::string_view env_var_name, bool default_val) } template <> -inline CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val) +CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val) { auto* env_val = std::getenv(env_var_name.data()); if (env_val == nullptr) { return default_val; } - return parse_compat_mode_str(env_val); + return detail::parse_compat_mode_str(env_val); } -} // namespace detail - unsigned int defaults::get_num_threads_from_env() { - const int ret = detail::getenv_or("KVIKIO_NTHREADS", 1); + const int ret = getenv_or("KVIKIO_NTHREADS", 1); if (ret <= 0) { throw std::invalid_argument("KVIKIO_NTHREADS has to be a positive integer greater than zero"); } @@ -104,11 +104,11 @@ defaults::defaults() { // Determine the default value of `compat_mode` { - _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO); + _compat_mode = getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO); } // Determine the default value of `task_size` { - const ssize_t env = detail::getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024); + const ssize_t env = getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024); if (env <= 0) { throw std::invalid_argument( "KVIKIO_TASK_SIZE has to be a positive integer greater than zero"); @@ -117,7 +117,7 @@ defaults::defaults() } // Determine the default value of `gds_threshold` { - const ssize_t env = detail::getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024); + const ssize_t env = getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024); if (env < 0) { throw std::invalid_argument("KVIKIO_GDS_THRESHOLD has to be a positive integer"); } @@ -125,7 +125,7 @@ defaults::defaults() } // Determine the default value of `bounce_buffer_size` { - const ssize_t env = detail::getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024); + const ssize_t env = getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024); if (env <= 0) { throw std::invalid_argument( "KVIKIO_BOUNCE_BUFFER_SIZE has to be a positive integer greater than zero"); diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 9fd0690891..8ca04f94ed 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -299,10 +299,7 @@ struct CallbackContext { * @param nmemb Size of the data in `nmemb`. * @param context A pointer to an instance of `CallbackContext`. */ -inline std::size_t callback_host_memory(char* data, - std::size_t size, - std::size_t nmemb, - void* context) +std::size_t callback_host_memory(char* data, std::size_t size, std::size_t nmemb, void* context) { auto ctx = reinterpret_cast(context); std::size_t const nbytes = size * nmemb; @@ -326,10 +323,7 @@ inline std::size_t callback_host_memory(char* data, * @param nmemb Size of the data in `nmemb`. * @param context A pointer to an instance of `CallbackContext`. */ -inline std::size_t callback_device_memory(char* data, - std::size_t size, - std::size_t nmemb, - void* context) +std::size_t callback_device_memory(char* data, std::size_t size, std::size_t nmemb, void* context) { auto ctx = reinterpret_cast(context); std::size_t const nbytes = size * nmemb; From 2700acedf86d9b01b0feafc75466bf7259073c71 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Jan 2025 15:17:44 -0500 Subject: [PATCH 8/8] Improve code quality of error.hpp --- cpp/include/kvikio/error.hpp | 106 ++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp index 2ecd37b0b3..33a4730b79 100644 --- a/cpp/include/kvikio/error.hpp +++ b/cpp/include/kvikio/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,26 +33,9 @@ struct CUfileException : public std::runtime_error { GET_CUDA_DRIVER_TRY_MACRO(__VA_ARGS__, CUDA_DRIVER_TRY_2, CUDA_DRIVER_TRY_1) \ (__VA_ARGS__) #define GET_CUDA_DRIVER_TRY_MACRO(_1, _2, NAME, ...) NAME -#define CUDA_DRIVER_TRY_2(_call, _exception_type) \ - do { \ - CUresult const error = (_call); \ - if (error == CUDA_ERROR_STUB_LIBRARY) { \ - throw(_exception_type){std::string{"CUDA error at: "} + __FILE__ + ":" + \ - KVIKIO_STRINGIFY(__LINE__) + \ - ": CUDA_ERROR_STUB_LIBRARY(" \ - "The CUDA driver loaded is a stub library)"}; \ - } \ - if (error != CUDA_SUCCESS) { \ - const char* err_name = nullptr; \ - const char* err_str = nullptr; \ - CUresult err_name_status = kvikio::cudaAPI::instance().GetErrorName(error, &err_name); \ - CUresult err_str_status = kvikio::cudaAPI::instance().GetErrorString(error, &err_str); \ - if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; } \ - if (err_str_status == CUDA_ERROR_INVALID_VALUE) { err_str = "unknown"; } \ - throw(_exception_type){std::string{"CUDA error at: "} + __FILE__ + ":" + \ - KVIKIO_STRINGIFY(__LINE__) + ": " + std::string(err_name) + "(" + \ - std::string(err_str) + ")"}; \ - } \ +#define CUDA_DRIVER_TRY_2(_call, _exception_type) \ + do { \ + kvikio::detail::cuda_driver_try_2<_exception_type>(_call, __LINE__, __FILE__); \ } while (0) #define CUDA_DRIVER_TRY_1(_call) CUDA_DRIVER_TRY_2(_call, kvikio::CUfileException) #endif @@ -62,18 +45,9 @@ struct CUfileException : public std::runtime_error { GET_CUFILE_TRY_MACRO(__VA_ARGS__, CUFILE_TRY_2, CUFILE_TRY_1) \ (__VA_ARGS__) #define GET_CUFILE_TRY_MACRO(_1, _2, NAME, ...) NAME -#define CUFILE_TRY_2(_call, _exception_type) \ - do { \ - CUfileError_t const error = (_call); \ - if (error.err != CU_FILE_SUCCESS) { \ - if (error.err == CU_FILE_CUDA_DRIVER_ERROR) { \ - CUresult const cuda_error = error.cu_err; \ - CUDA_DRIVER_TRY(cuda_error); \ - } \ - throw(_exception_type){std::string{"cuFile error at: "} + __FILE__ + ":" + \ - KVIKIO_STRINGIFY(__LINE__) + ": " + \ - cufileop_status_error((CUfileOpError)std::abs(error.err))}; \ - } \ +#define CUFILE_TRY_2(_call, _exception_type) \ + do { \ + kvikio::detail::cufile_try_2<_exception_type>(_call, __LINE__, __FILE__); \ } while (0) #define CUFILE_TRY_1(_call) CUFILE_TRY_2(_call, kvikio::CUfileException) #endif @@ -84,19 +58,63 @@ struct CUfileException : public std::runtime_error { __VA_ARGS__, CUFILE_CHECK_BYTES_DONE_2, CUFILE_CHECK_BYTES_DONE_1) \ (__VA_ARGS__) #define GET_CUFILE_CHECK_BYTES_DONE_MACRO(_1, _2, NAME, ...) NAME -#define CUFILE_CHECK_BYTES_DONE_2(_nbytes_done, _exception_type) \ - do { \ - auto const _nbytes = (_nbytes_done); \ - if (_nbytes < 0) { \ - auto const err = std::abs(_nbytes); \ - auto const msg = (err > CUFILEOP_BASE_ERR) \ - ? std::string(cufileop_status_error((CUfileOpError)err)) \ - : std::string(std::strerror(err)); \ - throw(_exception_type){std::string{"cuFile error at: "} + __FILE__ + ":" + \ - KVIKIO_STRINGIFY(__LINE__) + ": " + msg}; \ - } \ +#define CUFILE_CHECK_BYTES_DONE_2(_nbytes_done, _exception_type) \ + do { \ + kvikio::detail::cufile_check_bytes_done_2<_exception_type>(_nbytes_done, __LINE__, __FILE__); \ } while (0) #define CUFILE_CHECK_BYTES_DONE_1(_call) CUFILE_CHECK_BYTES_DONE_2(_call, kvikio::CUfileException) #endif +namespace detail { +template +void cuda_driver_try_2(CUresult error, int line_number, const char* filename) +{ + if (error == CUDA_ERROR_STUB_LIBRARY) { + throw Exception{std::string{"CUDA error at: "} + std::string(filename) + ":" + + KVIKIO_STRINGIFY(line_number) + + ": CUDA_ERROR_STUB_LIBRARY(" + "The CUDA driver loaded is a stub library)"}; + } + if (error != CUDA_SUCCESS) { + const char* err_name = nullptr; + const char* err_str = nullptr; + CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name); + CUresult err_str_status = cudaAPI::instance().GetErrorString(error, &err_str); + if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; } + if (err_str_status == CUDA_ERROR_INVALID_VALUE) { err_str = "unknown"; } + throw Exception{std::string{"CUDA error at: "} + filename + ":" + + KVIKIO_STRINGIFY(line_number) + ": " + std::string(err_name) + "(" + + std::string(err_str) + ")"}; + } +} + +template +void cufile_try_2(CUfileError_t error, int line_number, const char* filename) +{ + if (error.err != CU_FILE_SUCCESS) { + if (error.err == CU_FILE_CUDA_DRIVER_ERROR) { + CUresult const cuda_error = error.cu_err; + CUDA_DRIVER_TRY(cuda_error); + } + throw Exception{std::string{"cuFile error at: "} + filename + ":" + + KVIKIO_STRINGIFY(line_number) + ": " + + cufileop_status_error((CUfileOpError)std::abs(error.err))}; + } +} + +template +void cufile_check_bytes_done_2(ssize_t nbytes_done, int line_number, const char* filename) +{ + if (nbytes_done < 0) { + auto const err = std::abs(nbytes_done); + auto const msg = (err > CUFILEOP_BASE_ERR) + ? std::string(cufileop_status_error((CUfileOpError)err)) + : std::string(std::strerror(err)); + throw Exception{std::string{"cuFile error at: "} + filename + ":" + + KVIKIO_STRINGIFY(line_number) + ": " + msg}; + } +} + +} // namespace detail + } // namespace kvikio