From 718b078f219d409016b21bc0318e19c233f45653 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Tue, 7 Jan 2025 16:26:31 -0500
Subject: [PATCH 1/8] Separate implementation from interface

---
 cpp/CMakeLists.txt                   |   6 +-
 cpp/include/kvikio/cufile/config.hpp |  23 +--
 cpp/include/kvikio/cufile/driver.hpp | 232 +++----------------------
 cpp/include/kvikio/shim/cuda.hpp     |  50 +-----
 cpp/include/kvikio/shim/cufile.hpp   | 127 ++------------
 cpp/include/kvikio/shim/libcurl.hpp  | 106 ++----------
 cpp/include/kvikio/shim/utils.hpp    |  48 +----
 cpp/src/cufile/config.cpp            |  42 +++++
 cpp/src/cufile/driver.cpp            | 250 +++++++++++++++++++++++++++
 cpp/src/shim/cuda.cpp                |  72 ++++++++
 cpp/src/shim/cufile.cpp              | 157 +++++++++++++++++
 cpp/src/shim/libcurl.cpp             | 134 ++++++++++++++
 cpp/src/shim/utils.cpp               |  69 ++++++++
 13 files changed, 786 insertions(+), 530 deletions(-)
 create mode 100644 cpp/src/cufile/config.cpp
 create mode 100644 cpp/src/cufile/driver.cpp
 create mode 100644 cpp/src/shim/cuda.cpp
 create mode 100644 cpp/src/shim/cufile.cpp
 create mode 100644 cpp/src/shim/libcurl.cpp
 create mode 100644 cpp/src/shim/utils.cpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a0639c5382..ea7d29a06f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -133,5 +133,9 @@ include(cmake/thirdparty/get_thread_pool.cmake)
 
-set(SOURCES "src/file_handle.cpp")
+set(SOURCES "src/file_handle.cpp" "src/cufile/config.cpp" "src/cufile/driver.cpp"
+            "src/shim/cuda.cpp" "src/shim/cufile.cpp" "src/shim/utils.cpp"
+)
 
 if(KvikIO_REMOTE_SUPPORT)
+  # The libcurl shim needs <curl/curl.h>; compile it only when remote support is enabled.
+  list(APPEND SOURCES "src/shim/libcurl.cpp")
   list(APPEND SOURCES "src/remote_handle.cpp")
diff --git a/cpp/include/kvikio/cufile/config.hpp b/cpp/include/kvikio/cufile/config.hpp
index b1457880bb..7dd9ee7bcb 100644
--- a/cpp/include/kvikio/cufile/config.hpp
+++ b/cpp/include/kvikio/cufile/config.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,24 +15,11 @@
  */
 #pragma once
 
-#include <cstdlib>
-#include <filesystem>
 #include <string>
 
 #include <kvikio/utils.hpp>
 
 namespace kvikio {
-namespace detail {
-
-[[nodiscard]] inline const char* lookup_config_path()
-{
-  const char* env = std::getenv("CUFILE_ENV_PATH_JSON");
-  if (env != nullptr && std::filesystem::exists(env)) { return env; }
-  if (std::filesystem::exists("/etc/cufile.json")) { return "/etc/cufile.json"; }
-  return "";
-}
-
-}  // namespace detail
 
 /**
  * @brief Get the filepath to cuFile's config file (`cufile.json`) or the empty string
@@ -41,10 +28,6 @@ namespace detail {
  *
  * @return The filepath to the cufile.json file or the empty string if it isn't found.
  */
-[[nodiscard]] KVIKIO_EXPORT inline const std::string& config_path()
-{
-  static const std::string ret = detail::lookup_config_path();
-  return ret;
-}
+[[nodiscard]] KVIKIO_EXPORT const std::string& config_path();
 
 }  // namespace kvikio
diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp
index b609029a69..269761c75d 100644
--- a/cpp/include/kvikio/cufile/driver.hpp
+++ b/cpp/include/kvikio/cufile/driver.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,53 +15,24 @@
  */
 #pragma once
 
-#include <iostream>
 #include <vector>
 
-#include <kvikio/error.hpp>
 #include <kvikio/shim/cufile.hpp>
 #include <kvikio/shim/cufile_h_wrapper.hpp>
 
 namespace kvikio {
-namespace detail {
-
-[[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept
-{
-  return (prop & (1U << flag)) != 0;
-}
-
-inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept
-{
-  if (val) {
-    prop |= (1U << flag);
-  } else {
-    prop &= ~(1U << flag);
-  }
-}
-}  // namespace detail
-
-#ifdef KVIKIO_CUFILE_FOUND
 
 class DriverInitializer {
   // Optional, if not used cuFiles opens the driver automatically
  public:
-  DriverInitializer() { cuFileAPI::instance().driver_open(); }
+  DriverInitializer();
 
   DriverInitializer(DriverInitializer const&)                = delete;
   DriverInitializer& operator=(DriverInitializer const&)     = delete;
   DriverInitializer(DriverInitializer&&) noexcept            = delete;
   DriverInitializer& operator=(DriverInitializer&&) noexcept = delete;
 
-  ~DriverInitializer()
-  {
-    try {
-      cuFileAPI::instance().driver_close();
-    } catch (const CUfileException& e) {
-      std::cerr << "Unable to close GDS file driver: ";
-      std::cerr << e.what();
-      std::cerr << std::endl;
-    }
-  }
+  ~DriverInitializer();
 };
 
 class DriverProperties {
@@ -71,204 +42,45 @@ class DriverProperties {
 
   // Because Cython does not handle exceptions in the default
   // constructor, we initialize `_props` lazily.
-  void lazy_init()
-  {
-    if (_initialized) { return; }
-    _initialized = true;
-    CUFILE_TRY(cuFileAPI::instance().DriverGetProperties(&_props));
-  }
+  void lazy_init();
 
  public:
+#ifdef KVIKIO_CUFILE_FOUND
   DriverProperties() = default;
-
-  bool is_gds_available()
-  {
-    // If both the major and minor version is zero, the GDS driver isn't loaded.
-    return !(get_nvfs_major_version() == 0 && get_nvfs_minor_version() == 0);
-  }
-
-  [[nodiscard]] unsigned int get_nvfs_major_version()
-  {
-    lazy_init();
-    return _props.nvfs.major_version;
-  }
-
-  [[nodiscard]] unsigned int get_nvfs_minor_version()
-  {
-    lazy_init();
-    return _props.nvfs.minor_version;
-  }
-
-  [[nodiscard]] bool get_nvfs_allow_compat_mode()
-  {
-    lazy_init();
-    return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE);
-  }
-
-  [[nodiscard]] bool get_nvfs_poll_mode()
-  {
-    lazy_init();
-    return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE);
-  }
-
-  [[nodiscard]] std::size_t get_nvfs_poll_thresh_size()
-  {
-    lazy_init();
-    return _props.nvfs.poll_thresh_size;
-  }
-
-  void set_nvfs_poll_mode(bool enable)
-  {
-    lazy_init();
-    CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(enable, get_nvfs_poll_thresh_size()));
-    detail::set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable);
-  }
-
-  void set_nvfs_poll_thresh_size(std::size_t size_in_kb)
-  {
-    lazy_init();
-    CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(get_nvfs_poll_mode(), size_in_kb));
-    _props.nvfs.poll_thresh_size = size_in_kb;
-  }
-
-  [[nodiscard]] std::vector<CUfileDriverControlFlags> get_nvfs_statusflags()
-  {
-    lazy_init();
-    std::vector<CUfileDriverControlFlags> ret;
-    if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) {
-      ret.push_back(CU_FILE_USE_POLL_MODE);
-    }
-    if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) {
-      ret.push_back(CU_FILE_ALLOW_COMPAT_MODE);
-    }
-    return ret;
-  }
-
-  [[nodiscard]] std::size_t get_max_device_cache_size()
-  {
-    lazy_init();
-    return _props.max_device_cache_size;
-  }
-
-  void set_max_device_cache_size(std::size_t size_in_kb)
-  {
-    lazy_init();
-    CUFILE_TRY(cuFileAPI::instance().DriverSetMaxCacheSize(size_in_kb));
-    _props.max_device_cache_size = size_in_kb;
-  }
-
-  [[nodiscard]] std::size_t get_per_buffer_cache_size()
-  {
-    lazy_init();
-    return _props.per_buffer_cache_size;
-  }
-
-  [[nodiscard]] std::size_t get_max_pinned_memory_size()
-  {
-    lazy_init();
-    return _props.max_device_pinned_mem_size;
-  }
-
-  void set_max_pinned_memory_size(std::size_t size_in_kb)
-  {
-    lazy_init();
-    CUFILE_TRY(cuFileAPI::instance().DriverSetMaxPinnedMemSize(size_in_kb));
-    _props.max_device_pinned_mem_size = size_in_kb;
-  }
-
-  [[nodiscard]] std::size_t get_max_batch_io_size()
-  {
-#ifdef KVIKIO_CUFILE_BATCH_API_FOUND
-    lazy_init();
-    return _props.max_batch_io_size;
-#else
-    return 0;
-#endif
-  }
-};
-
 #else
-struct DriverInitializer {
   // Implement a non-default constructor to avoid `unused variable` warnings downstream
-  DriverInitializer() {}
-};
-
-struct DriverProperties {
-  // Implement a non-default constructor to avoid `unused variable` warnings downstream
-  DriverProperties() {}
+  DriverProperties();
+#endif
 
-  static bool is_gds_available() { return false; }
+  bool is_gds_available();
 
-  [[nodiscard]] static unsigned int get_nvfs_major_version()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] unsigned int get_nvfs_major_version();
 
-  [[nodiscard]] static unsigned int get_nvfs_minor_version()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] unsigned int get_nvfs_minor_version();
 
-  [[nodiscard]] static bool get_nvfs_allow_compat_mode()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] bool get_nvfs_allow_compat_mode();
 
-  [[nodiscard]] static bool get_nvfs_poll_mode()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] bool get_nvfs_poll_mode();
 
-  [[nodiscard]] static std::size_t get_nvfs_poll_thresh_size()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] std::size_t get_nvfs_poll_thresh_size();
 
-  static void set_nvfs_poll_mode(bool enable)
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  void set_nvfs_poll_mode(bool enable);
 
-  static void set_nvfs_poll_thresh_size(std::size_t size_in_kb)
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  void set_nvfs_poll_thresh_size(std::size_t size_in_kb);
 
-  [[nodiscard]] static std::vector<CUfileDriverControlFlags> get_nvfs_statusflags()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] std::vector<CUfileDriverControlFlags> get_nvfs_statusflags();
 
-  [[nodiscard]] static std::size_t get_max_device_cache_size()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] std::size_t get_max_device_cache_size();
 
-  static void set_max_device_cache_size(std::size_t size_in_kb)
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  void set_max_device_cache_size(std::size_t size_in_kb);
 
-  [[nodiscard]] static std::size_t get_per_buffer_cache_size()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] std::size_t get_per_buffer_cache_size();
 
-  [[nodiscard]] static std::size_t get_max_pinned_memory_size()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] std::size_t get_max_pinned_memory_size();
 
-  static void set_max_pinned_memory_size(std::size_t size_in_kb)
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  void set_max_pinned_memory_size(std::size_t size_in_kb);
 
-  [[nodiscard]] std::size_t get_max_batch_io_size()
-  {
-    throw CUfileException("KvikIO not compiled with cuFile.h");
-  }
+  [[nodiscard]] std::size_t get_max_batch_io_size();
 };
-#endif
 
 }  // namespace kvikio
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index 606a618736..f868d40f58 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,46 +50,13 @@ class cudaAPI {
   decltype(cuStreamDestroy)* StreamDestroy{nullptr};
 
  private:
-#ifdef KVIKIO_CUDA_FOUND
-  cudaAPI()
-  {
-    void* lib = load_library("libcuda.so.1");
-    // Notice, the API version loaded must match the version used downstream. That is,
-    // if a project uses the `_v2` CUDA Driver API or the newest Runtime API, the symbols
-    // loaded should also be the `_v2` symbols. Thus, we use KVIKIO_STRINGIFY() to get
-    // the name of the symbol through cude.h.
-    get_symbol(MemHostAlloc, lib, KVIKIO_STRINGIFY(cuMemHostAlloc));
-    get_symbol(MemFreeHost, lib, KVIKIO_STRINGIFY(cuMemFreeHost));
-    get_symbol(MemcpyHtoDAsync, lib, KVIKIO_STRINGIFY(cuMemcpyHtoDAsync));
-    get_symbol(MemcpyDtoHAsync, lib, KVIKIO_STRINGIFY(cuMemcpyDtoHAsync));
-    get_symbol(PointerGetAttribute, lib, KVIKIO_STRINGIFY(cuPointerGetAttribute));
-    get_symbol(PointerGetAttributes, lib, KVIKIO_STRINGIFY(cuPointerGetAttributes));
-    get_symbol(CtxPushCurrent, lib, KVIKIO_STRINGIFY(cuCtxPushCurrent));
-    get_symbol(CtxPopCurrent, lib, KVIKIO_STRINGIFY(cuCtxPopCurrent));
-    get_symbol(CtxGetCurrent, lib, KVIKIO_STRINGIFY(cuCtxGetCurrent));
-    get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
-    get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
-    get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
-    get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
-    get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
-    get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
-    get_symbol(StreamSynchronize, lib, KVIKIO_STRINGIFY(cuStreamSynchronize));
-    get_symbol(StreamCreate, lib, KVIKIO_STRINGIFY(cuStreamCreate));
-    get_symbol(StreamDestroy, lib, KVIKIO_STRINGIFY(cuStreamDestroy));
-  }
-#else
-  cudaAPI() { throw std::runtime_error("KvikIO not compiled with CUDA support"); }
-#endif
+  cudaAPI();
 
  public:
   cudaAPI(cudaAPI const&)        = delete;
   void operator=(cudaAPI const&) = delete;
 
-  KVIKIO_EXPORT static cudaAPI& instance()
-  {
-    static cudaAPI _instance;
-    return _instance;
-  }
+  KVIKIO_EXPORT static cudaAPI& instance();
 };
 
 /**
@@ -100,15 +67,7 @@ class cudaAPI {
  * @return The boolean answer
  */
 #ifdef KVIKIO_CUDA_FOUND
-inline bool is_cuda_available()
-{
-  try {
-    cudaAPI::instance();
-  } catch (const std::runtime_error&) {
-    return false;
-  }
-  return true;
-}
+bool is_cuda_available();
 #else
 constexpr bool is_cuda_available() { return false; }
 #endif
diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp
index 5194d45e74..c90fba1fce 100644
--- a/cpp/include/kvikio/shim/cufile.hpp
+++ b/cpp/include/kvikio/shim/cufile.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,6 @@
  */
 #pragma once
 
-#include <stdexcept>
-#include <string>
-
 #include <kvikio/shim/cufile_h_wrapper.hpp>
 #include <kvikio/shim/utils.hpp>
 
@@ -64,79 +61,15 @@ class cuFileAPI {
   int version{0};
 
  private:
-#ifdef KVIKIO_CUFILE_FOUND
-  cuFileAPI()
-  {
-    // CUDA versions before CUDA 11.7.1 did not ship libcufile.so.0, so this is
-    // a workaround that adds support for all prior versions of libcufile.
-    void* lib = load_library({"libcufile.so.0",
-                              "libcufile.so.1.3.0" /* 11.7.0 */,
-                              "libcufile.so.1.2.1" /* 11.6.2, 11.6.1 */,
-                              "libcufile.so.1.2.0" /* 11.6.0 */,
-                              "libcufile.so.1.1.1" /* 11.5.1 */,
-                              "libcufile.so.1.1.0" /* 11.5.0 */,
-                              "libcufile.so.1.0.2" /* 11.4.4, 11.4.3, 11.4.2 */,
-                              "libcufile.so.1.0.1" /* 11.4.1 */,
-                              "libcufile.so.1.0.0" /* 11.4.0 */});
-    get_symbol(HandleRegister, lib, KVIKIO_STRINGIFY(cuFileHandleRegister));
-    get_symbol(HandleDeregister, lib, KVIKIO_STRINGIFY(cuFileHandleDeregister));
-    get_symbol(Read, lib, KVIKIO_STRINGIFY(cuFileRead));
-    get_symbol(Write, lib, KVIKIO_STRINGIFY(cuFileWrite));
-    get_symbol(BufRegister, lib, KVIKIO_STRINGIFY(cuFileBufRegister));
-    get_symbol(BufDeregister, lib, KVIKIO_STRINGIFY(cuFileBufDeregister));
-    get_symbol(DriverOpen, lib, KVIKIO_STRINGIFY(cuFileDriverOpen));
-    get_symbol(DriverClose, lib, KVIKIO_STRINGIFY(cuFileDriverClose));
-    get_symbol(DriverGetProperties, lib, KVIKIO_STRINGIFY(cuFileDriverGetProperties));
-    get_symbol(DriverSetPollMode, lib, KVIKIO_STRINGIFY(cuFileDriverSetPollMode));
-    get_symbol(DriverSetMaxCacheSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxCacheSize));
-    get_symbol(DriverSetMaxPinnedMemSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxPinnedMemSize));
-
-#ifdef KVIKIO_CUFILE_VERSION_API_FOUND
-    try {
-      get_symbol(GetVersion, lib, KVIKIO_STRINGIFY(cuFileGetVersion));
-      int ver;
-      CUfileError_t const error = GetVersion(&ver);
-      if (error.err == CU_FILE_SUCCESS) { version = ver; }
-    } catch (std::runtime_error const&) {
-    }
-#endif
-
-    // Some symbols were introduced in later versions, so version guards are required.
-    // Note: `version` is 0 for cuFile versions prior to v1.8 because `cuFileGetVersion`
-    // did not exist. As a result, the batch and stream APIs are not loaded in versions
-    // 1.6 and 1.7, respectively, even though they are available. This trade-off is made
-    // for improved robustness.
-    if (version >= 1060) {
-      get_symbol(BatchIOSetUp, lib, KVIKIO_STRINGIFY(cuFileBatchIOSetUp));
-      get_symbol(BatchIOSubmit, lib, KVIKIO_STRINGIFY(cuFileBatchIOSubmit));
-      get_symbol(BatchIOGetStatus, lib, KVIKIO_STRINGIFY(cuFileBatchIOGetStatus));
-      get_symbol(BatchIOCancel, lib, KVIKIO_STRINGIFY(cuFileBatchIOCancel));
-      get_symbol(BatchIODestroy, lib, KVIKIO_STRINGIFY(cuFileBatchIODestroy));
-    }
-    if (version >= 1070) {
-      get_symbol(ReadAsync, lib, KVIKIO_STRINGIFY(cuFileReadAsync));
-      get_symbol(WriteAsync, lib, KVIKIO_STRINGIFY(cuFileWriteAsync));
-      get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister));
-      get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister));
-    }
-
-    // cuFile is supposed to open and close the driver automatically but
-    // because of a bug in cuFile v1.4 (CUDA v11.8) it sometimes segfaults:
-    // <https://github.com/rapidsai/kvikio/issues/159>.
-    if (version < 1050) { driver_open(); }
-  }
+  cuFileAPI();
 
+#ifdef KVIKIO_CUFILE_FOUND
   // Notice, we have to close the driver at program exit (if we opened it) even though we are
   // not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the
   // driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but
   // this seems to be the lesser of two evils.
   // [1] <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
-  ~cuFileAPI()
-  {
-    if (version < 1050) { driver_close(); }
-  }
-#else
-  cuFileAPI() { throw std::runtime_error("KvikIO not compiled with cuFile.h"); }
+  ~cuFileAPI();
 #endif
 
  public:
@@ -145,11 +78,7 @@ class cuFileAPI {
   cuFileAPI(cuFileAPI const&&)      = delete;
   void operator=(cuFileAPI const&&) = delete;
 
-  KVIKIO_EXPORT static cuFileAPI& instance()
-  {
-    static cuFileAPI _instance;
-    return _instance;
-  }
+  KVIKIO_EXPORT static cuFileAPI& instance();
 
   /**
    * @brief Open the cuFile driver
@@ -157,26 +86,12 @@ class cuFileAPI {
    * cuFile allows multiple calls to `cufileDriverOpen()`, only the first call opens
    * the driver, but every call should have a matching call to `cufileDriverClose()`.
    */
-  void driver_open()
-  {
-    CUfileError_t const error = DriverOpen();
-    if (error.err != CU_FILE_SUCCESS) {
-      throw std::runtime_error(std::string{"Unable to open GDS file driver: "} +
-                               cufileop_status_error(error.err));
-    }
-  }
+  void driver_open();
 
   /**
    * @brief Close the cuFile driver
    */
-  void driver_close()
-  {
-    CUfileError_t const error = DriverClose();
-    if (error.err != CU_FILE_SUCCESS) {
-      throw std::runtime_error(std::string{"Unable to close GDS file driver: "} +
-                               cufileop_status_error(error.err));
-    }
-  }
+  void driver_close();
 };
 
 /**
@@ -187,15 +102,7 @@ class cuFileAPI {
  * @return The boolean answer
  */
 #ifdef KVIKIO_CUFILE_FOUND
-inline bool is_cufile_library_available()
-{
-  try {
-    cuFileAPI::instance();
-  } catch (const std::runtime_error&) {
-    return false;
-  }
-  return true;
-}
+bool is_cufile_library_available();
 #else
 constexpr bool is_cufile_library_available() { return false; }
 #endif
@@ -208,10 +115,7 @@ constexpr bool is_cufile_library_available() { return false; }
  *
  * @return The boolean answer
  */
-inline bool is_cufile_available()
-{
-  return is_cufile_library_available() && run_udev_readable() && !is_running_in_wsl();
-}
+bool is_cufile_available();
 
 /**
  * @brief Get cufile version (or zero if older than v1.8).
@@ -225,14 +129,7 @@ inline bool is_cufile_available()
  * @return The version (1000*major + 10*minor) or zero if older than 1080.
  */
 #ifdef KVIKIO_CUFILE_FOUND
-inline int cufile_version()
-{
-  try {
-    return cuFileAPI::instance().version;
-  } catch (std::runtime_error const&) {
-    return 0;
-  }
-}
+int cufile_version();
 #else
 constexpr int cufile_version() { return 0; }
 #endif
@@ -246,7 +143,7 @@ constexpr int cufile_version() { return 0; }
  *
  * @return The boolean answer
  */
-inline bool is_batch_api_available() noexcept { return cufile_version() >= 1060; }
+bool is_batch_api_available() noexcept;
 
 /**
  * @brief Check if cuFile's stream (async) API is available.
@@ -257,6 +154,6 @@ inline bool is_batch_api_available() noexcept { return cufile_version() >= 1060;
  *
  * @return The boolean answer
  */
-inline bool is_stream_api_available() noexcept { return cufile_version() >= 1070; }
+bool is_stream_api_available() noexcept;
 
 }  // namespace kvikio
diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp
index 423eff9c60..a4efab02e5 100644
--- a/cpp/include/kvikio/shim/libcurl.hpp
+++ b/cpp/include/kvikio/shim/libcurl.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,22 +20,14 @@
   "cannot include the remote IO API, please build KvikIO with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)"
 #endif
 
-#include <cstring>
 #include <functional>
 #include <memory>
 #include <sstream>
-#include <stdexcept>
 #include <string>
 #include <vector>
 
 #include <curl/curl.h>
 
-#include <kvikio/defaults.hpp>
-#include <kvikio/error.hpp>
-#include <kvikio/parallel_operation.hpp>
-#include <kvikio/posix_io.hpp>
-#include <kvikio/utils.hpp>
-
 namespace kvikio {
 
 /**
@@ -65,72 +57,26 @@ class LibCurl {
   // Curl handles free to be used.
   std::vector<UniqueHandlePtr> _free_curl_handles{};
 
-  LibCurl()
-  {
-    CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT);
-    if (err != CURLE_OK) {
-      throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err));
-    }
-    curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW);
-    if ((ver->features & CURL_VERSION_THREADSAFE) == 0) {
-      throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled");
-    }
-  }
-  ~LibCurl() noexcept
-  {
-    _free_curl_handles.clear();
-    curl_global_cleanup();
-  }
+  LibCurl();
+  ~LibCurl() noexcept;
 
  public:
-  static LibCurl& instance()
-  {
-    static LibCurl _instance;
-    return _instance;
-  }
+  static LibCurl& instance();
 
   /**
    * @brief Returns a free curl handle if available.
    */
-  UniqueHandlePtr get_free_handle()
-  {
-    UniqueHandlePtr ret;
-    std::lock_guard const lock(_mutex);
-    if (!_free_curl_handles.empty()) {
-      ret = std::move(_free_curl_handles.back());
-      _free_curl_handles.pop_back();
-    }
-    return ret;
-  }
+  UniqueHandlePtr get_free_handle();
 
   /**
    * @brief Returns a curl handle, create a new handle if none is available.
    */
-  UniqueHandlePtr get_handle()
-  {
-    // Check if we have a free handle available.
-    UniqueHandlePtr ret = get_free_handle();
-    if (ret) {
-      curl_easy_reset(ret.get());
-    } else {
-      // If not, we create a new handle.
-      CURL* raw_handle = curl_easy_init();
-      if (raw_handle == nullptr) {
-        throw std::runtime_error("libcurl: call to curl_easy_init() failed");
-      }
-      ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup);
-    }
-    return ret;
-  }
+  UniqueHandlePtr get_handle();
 
   /**
    * @brief Retain a curl handle for later use.
    */
-  void retain_handle(UniqueHandlePtr handle)
-  {
-    std::lock_guard const lock(_mutex);
-    _free_curl_handles.push_back(std::move(handle));
-  }
+  void retain_handle(UniqueHandlePtr handle);
 };
 
 /**
@@ -156,23 +102,8 @@ class CurlHandle {
    * @param source_file Path of source file of the caller (for error messages).
    * @param source_line Line of source file of the caller (for error messages).
    */
-  CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line)
-    : _handle{std::move(handle)},
-      _source_file(std::move(source_file)),
-      _source_line(std::move(source_line))
-  {
-    // Need CURLOPT_NOSIGNAL to support threading, see
-    // <https://curl.se/libcurl/c/CURLOPT_NOSIGNAL.html>
-    setopt(CURLOPT_NOSIGNAL, 1L);
-
-    // We always set CURLOPT_ERRORBUFFER to get better error messages.
-    _errbuf[0] = 0;  // Set the error buffer as empty.
-    setopt(CURLOPT_ERRORBUFFER, _errbuf);
-
-    // Make curl_easy_perform() fail when receiving HTTP code errors.
-    setopt(CURLOPT_FAILONERROR, 1L);
-  }
-  ~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); }
+  CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line);
+  ~CurlHandle() noexcept;
 
   /**
    * @brief CurlHandle support is not movable or copyable.
@@ -185,7 +116,7 @@ class CurlHandle {
   /**
    * @brief Get the underlying curl easy handle pointer.
    */
-  CURL* handle() noexcept { return _handle.get(); }
+  CURL* handle() noexcept;
 
   /**
    * @brief Set option for the curl handle.
@@ -212,22 +143,7 @@ class CurlHandle {
    *
    * See <https://curl.se/libcurl/c/curl_easy_perform.html>.
    */
-  void perform()
-  {
-    // Perform the curl operation and check for errors.
-    CURLcode err = curl_easy_perform(handle());
-    if (err != CURLE_OK) {
-      std::string msg(_errbuf);  // We can do this because we always initialize `_errbuf` as empty.
-      std::stringstream ss;
-      ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line;
-      if (msg.empty()) {
-        ss << "(" << curl_easy_strerror(err) << ")";
-      } else {
-        ss << "(" << msg << ")";
-      }
-      throw std::runtime_error(ss.str());
-    }
-  }
+  void perform();
 
   /**
    * @brief Extract information from a curl handle.
diff --git a/cpp/include/kvikio/shim/utils.hpp b/cpp/include/kvikio/shim/utils.hpp
index 7a3c439899..bc47be205f 100644
--- a/cpp/include/kvikio/shim/utils.hpp
+++ b/cpp/include/kvikio/shim/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,6 @@
 
 #include <dlfcn.h>
 #include <sys/utsname.h>
-#include <filesystem>
-#include <sstream>
 #include <vector>
 
 namespace kvikio {
@@ -46,13 +44,7 @@ namespace kvikio {
  * @param name Name of the library to load.
  * @return The library handle.
  */
-inline void* load_library(const char* name, int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE)
-{
-  ::dlerror();  // Clear old errors
-  void* ret = ::dlopen(name, mode);
-  if (ret == nullptr) { throw std::runtime_error(::dlerror()); }
-  return ret;
-}
+void* load_library(const char* name, int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE);
 
 /**
  * @brief Load shared library
@@ -60,19 +52,8 @@ inline void* load_library(const char* name, int mode = RTLD_LAZY | RTLD_LOCAL |
  * @param names Vector of names to try when loading shared library.
  * @return The library handle.
  */
-inline void* load_library(const std::vector<const char*>& names,
-                          int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE)
-{
-  std::stringstream ss;
-  for (const char* name : names) {
-    ss << name << " ";
-    try {
-      return load_library(name, mode);
-    } catch (const std::runtime_error&) {
-    }
-  }
-  throw std::runtime_error("cannot open shared object file, tried: " + ss.str());
-}
+void* load_library(const std::vector<const char*>& names,
+                   int mode = RTLD_LAZY | RTLD_LOCAL | RTLD_NODELETE);
 
 /**
  * @brief Get symbol using `dlsym`
@@ -99,17 +80,7 @@ void get_symbol(T& handle, void* lib, const char* name)
  *
  * @return The boolean answer
  */
-[[nodiscard]] inline bool is_running_in_wsl()
-{
-  struct utsname buf {};
-  int err = ::uname(&buf);
-  if (err == 0) {
-    const std::string name(static_cast<char*>(buf.release));
-    // 'Microsoft' for WSL1 and 'microsoft' for WSL2
-    return name.find("icrosoft") != std::string::npos;
-  }
-  return false;
-}
+[[nodiscard]] bool is_running_in_wsl();
 
 /**
  * @brief Check if `/run/udev` is readable
@@ -120,13 +91,6 @@ void get_symbol(T& handle, void* lib, const char* name)
  *
  * @return The boolean answer
  */
-[[nodiscard]] inline bool run_udev_readable()
-{
-  try {
-    return std::filesystem::is_directory("/run/udev");
-  } catch (const std::filesystem::filesystem_error&) {
-    return false;
-  }
-}
+[[nodiscard]] bool run_udev_readable();
 
 }  // namespace kvikio
diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp
new file mode 100644
index 0000000000..7566c11532
--- /dev/null
+++ b/cpp/src/cufile/config.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdlib>
+#include <filesystem>
+#include <string>
+
+#include <kvikio/cufile/config.hpp>
+
+namespace kvikio {
+namespace detail {
+
+[[nodiscard]] inline const char* lookup_config_path()
+{
+  const char* env = std::getenv("CUFILE_ENV_PATH_JSON");
+  if (env != nullptr && std::filesystem::exists(env)) { return env; }
+  if (std::filesystem::exists("/etc/cufile.json")) { return "/etc/cufile.json"; }
+  return "";
+}
+
+}  // namespace detail
+
+const std::string& config_path()
+{
+  static const std::string ret = detail::lookup_config_path();
+  return ret;
+}
+
+}  // namespace kvikio
diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp
new file mode 100644
index 0000000000..959a64b33b
--- /dev/null
+++ b/cpp/src/cufile/driver.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <vector>
+
+#include <kvikio/cufile/driver.hpp>
+#include <kvikio/error.hpp>
+#include <kvikio/shim/cufile.hpp>
+#include <kvikio/shim/cufile_h_wrapper.hpp>
+
+namespace kvikio {
+namespace detail {
+
+[[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept
+{
+  return (prop & (1U << flag)) != 0;
+}
+
+inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept
+{
+  if (val) {
+    prop |= (1U << flag);
+  } else {
+    prop &= ~(1U << flag);
+  }
+}
+}  // namespace detail
+
+#ifdef KVIKIO_CUFILE_FOUND
+
+DriverInitializer::DriverInitializer() { cuFileAPI::instance().driver_open(); }
+
+DriverInitializer::~DriverInitializer()
+{
+  try {
+    cuFileAPI::instance().driver_close();
+  } catch (const CUfileException& e) {
+    std::cerr << "Unable to close GDS file driver: ";
+    std::cerr << e.what();
+    std::cerr << std::endl;
+  }
+}
+
+// Because Cython does not handle exceptions in the default
+// constructor, we initialize `_props` lazily.
+void DriverProperties::lazy_init()
+{
+  if (_initialized) { return; }
+  _initialized = true;
+  CUFILE_TRY(cuFileAPI::instance().DriverGetProperties(&_props));
+}
+
+bool DriverProperties::is_gds_available()
+{
+  // If both the major and minor versions are zero, the GDS driver isn't loaded.
+  return !(get_nvfs_major_version() == 0 && get_nvfs_minor_version() == 0);
+}
+
+[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version()
+{
+  lazy_init();
+  return _props.nvfs.major_version;
+}
+
+[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version()
+{
+  lazy_init();
+  return _props.nvfs.minor_version;
+}
+
+[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode()
+{
+  lazy_init();
+  return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE);
+}
+
+[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode()
+{
+  lazy_init();
+  return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE);
+}
+
+[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size()
+{
+  lazy_init();
+  return _props.nvfs.poll_thresh_size;
+}
+
+void DriverProperties::set_nvfs_poll_mode(bool enable)
+{
+  lazy_init();
+  CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(enable, get_nvfs_poll_thresh_size()));
+  detail::set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable);
+}
+
+void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb)
+{
+  lazy_init();
+  CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(get_nvfs_poll_mode(), size_in_kb));
+  _props.nvfs.poll_thresh_size = size_in_kb;
+}
+
+[[nodiscard]] std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
+{
+  lazy_init();
+  std::vector<CUfileDriverControlFlags> ret;
+  if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) {
+    ret.push_back(CU_FILE_USE_POLL_MODE);
+  }
+  if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) {
+    ret.push_back(CU_FILE_ALLOW_COMPAT_MODE);
+  }
+  return ret;
+}
+
+[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size()
+{
+  lazy_init();
+  return _props.max_device_cache_size;
+}
+
+void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb)
+{
+  lazy_init();
+  CUFILE_TRY(cuFileAPI::instance().DriverSetMaxCacheSize(size_in_kb));
+  _props.max_device_cache_size = size_in_kb;
+}
+
+[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size()
+{
+  lazy_init();
+  return _props.per_buffer_cache_size;
+}
+
+[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size()
+{
+  lazy_init();
+  return _props.max_device_pinned_mem_size;
+}
+
+void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb)
+{
+  lazy_init();
+  CUFILE_TRY(cuFileAPI::instance().DriverSetMaxPinnedMemSize(size_in_kb));
+  _props.max_device_pinned_mem_size = size_in_kb;
+}
+
+[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size()
+{
+#ifdef KVIKIO_CUFILE_BATCH_API_FOUND
+  lazy_init();
+  return _props.max_batch_io_size;
+#else
+  return 0;
+#endif
+}
+
+#else
+DriverInitializer::DriverInitializer() {}
+
+DriverProperties::DriverProperties() {}
+
+static bool DriverProperties::is_gds_available() { return false; }
+
+[[nodiscard]] static unsigned int DriverProperties::get_nvfs_major_version()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static unsigned int DriverProperties::get_nvfs_minor_version()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static bool DriverProperties::get_nvfs_allow_compat_mode()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static bool DriverProperties::get_nvfs_poll_mode()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static std::size_t DriverProperties::get_nvfs_poll_thresh_size()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+static void DriverProperties::set_nvfs_poll_mode(bool enable)
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+static void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb)
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static std::size_t DriverProperties::get_max_device_cache_size()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+static void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb)
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static std::size_t DriverProperties::get_per_buffer_cache_size()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] static std::size_t DriverProperties::get_max_pinned_memory_size()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+static void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb)
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+
+[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size()
+{
+  throw CUfileException("KvikIO not compiled with cuFile.h");
+}
+#endif
+
+}  // namespace kvikio
diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp
new file mode 100644
index 0000000000..b6a1b3babf
--- /dev/null
+++ b/cpp/src/shim/cuda.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdexcept>
+
+#include <kvikio/shim/cuda.hpp>
+
+namespace kvikio {
+
+#ifdef KVIKIO_CUDA_FOUND
+cudaAPI::cudaAPI()
+{
+  void* lib = load_library("libcuda.so.1");
+  // Notice, the API version loaded must match the version used downstream. That is,
+  // if a project uses the `_v2` CUDA Driver API or the newest Runtime API, the symbols
+  // loaded should also be the `_v2` symbols. Thus, we use KVIKIO_STRINGIFY() to get
+  // the name of the symbol through cuda.h.
+  get_symbol(MemHostAlloc, lib, KVIKIO_STRINGIFY(cuMemHostAlloc));
+  get_symbol(MemFreeHost, lib, KVIKIO_STRINGIFY(cuMemFreeHost));
+  get_symbol(MemcpyHtoDAsync, lib, KVIKIO_STRINGIFY(cuMemcpyHtoDAsync));
+  get_symbol(MemcpyDtoHAsync, lib, KVIKIO_STRINGIFY(cuMemcpyDtoHAsync));
+  get_symbol(PointerGetAttribute, lib, KVIKIO_STRINGIFY(cuPointerGetAttribute));
+  get_symbol(PointerGetAttributes, lib, KVIKIO_STRINGIFY(cuPointerGetAttributes));
+  get_symbol(CtxPushCurrent, lib, KVIKIO_STRINGIFY(cuCtxPushCurrent));
+  get_symbol(CtxPopCurrent, lib, KVIKIO_STRINGIFY(cuCtxPopCurrent));
+  get_symbol(CtxGetCurrent, lib, KVIKIO_STRINGIFY(cuCtxGetCurrent));
+  get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
+  get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
+  get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
+  get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
+  get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
+  get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
+  get_symbol(StreamSynchronize, lib, KVIKIO_STRINGIFY(cuStreamSynchronize));
+  get_symbol(StreamCreate, lib, KVIKIO_STRINGIFY(cuStreamCreate));
+  get_symbol(StreamDestroy, lib, KVIKIO_STRINGIFY(cuStreamDestroy));
+}
+#else
+cudaAPI::cudaAPI() { throw std::runtime_error("KvikIO not compiled with CUDA support"); }
+#endif
+
+cudaAPI& cudaAPI::instance()
+{
+  static cudaAPI _instance;
+  return _instance;
+}
+
+#ifdef KVIKIO_CUDA_FOUND
+bool is_cuda_available()
+{
+  try {
+    cudaAPI::instance();
+  } catch (const std::runtime_error&) {
+    return false;
+  }
+  return true;
+}
+#endif
+
+}  // namespace kvikio
diff --git a/cpp/src/shim/cufile.cpp b/cpp/src/shim/cufile.cpp
new file mode 100644
index 0000000000..a6758cab7b
--- /dev/null
+++ b/cpp/src/shim/cufile.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdexcept>
+#include <string>
+
+#include <kvikio/shim/cufile.hpp>
+#include <kvikio/shim/cufile_h_wrapper.hpp>
+#include <kvikio/shim/utils.hpp>
+
+namespace kvikio {
+
+#ifdef KVIKIO_CUFILE_FOUND
+cuFileAPI::cuFileAPI()
+{
+  // CUDA versions before CUDA 11.7.1 did not ship libcufile.so.0, so this is
+  // a workaround that adds support for all prior versions of libcufile.
+  void* lib = load_library({"libcufile.so.0",
+                            "libcufile.so.1.3.0" /* 11.7.0 */,
+                            "libcufile.so.1.2.1" /* 11.6.2, 11.6.1 */,
+                            "libcufile.so.1.2.0" /* 11.6.0 */,
+                            "libcufile.so.1.1.1" /* 11.5.1 */,
+                            "libcufile.so.1.1.0" /* 11.5.0 */,
+                            "libcufile.so.1.0.2" /* 11.4.4, 11.4.3, 11.4.2 */,
+                            "libcufile.so.1.0.1" /* 11.4.1 */,
+                            "libcufile.so.1.0.0" /* 11.4.0 */});
+  get_symbol(HandleRegister, lib, KVIKIO_STRINGIFY(cuFileHandleRegister));
+  get_symbol(HandleDeregister, lib, KVIKIO_STRINGIFY(cuFileHandleDeregister));
+  get_symbol(Read, lib, KVIKIO_STRINGIFY(cuFileRead));
+  get_symbol(Write, lib, KVIKIO_STRINGIFY(cuFileWrite));
+  get_symbol(BufRegister, lib, KVIKIO_STRINGIFY(cuFileBufRegister));
+  get_symbol(BufDeregister, lib, KVIKIO_STRINGIFY(cuFileBufDeregister));
+  get_symbol(DriverOpen, lib, KVIKIO_STRINGIFY(cuFileDriverOpen));
+  get_symbol(DriverClose, lib, KVIKIO_STRINGIFY(cuFileDriverClose));
+  get_symbol(DriverGetProperties, lib, KVIKIO_STRINGIFY(cuFileDriverGetProperties));
+  get_symbol(DriverSetPollMode, lib, KVIKIO_STRINGIFY(cuFileDriverSetPollMode));
+  get_symbol(DriverSetMaxCacheSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxCacheSize));
+  get_symbol(DriverSetMaxPinnedMemSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxPinnedMemSize));
+
+#ifdef KVIKIO_CUFILE_VERSION_API_FOUND
+  try {
+    get_symbol(GetVersion, lib, KVIKIO_STRINGIFY(cuFileGetVersion));
+    int ver;
+    CUfileError_t const error = GetVersion(&ver);
+    if (error.err == CU_FILE_SUCCESS) { version = ver; }
+  } catch (std::runtime_error const&) {
+  }
+#endif
+
+  // Some symbols were introduced in later versions, so version guards are required.
+  // Note: `version` is 0 for cuFile versions prior to v1.8 because `cuFileGetVersion`
+  // did not exist. As a result, the batch and stream APIs are not loaded in versions
+  // 1.6 and 1.7, respectively, even though they are available. This trade-off is made
+  // for improved robustness.
+  if (version >= 1060) {
+    get_symbol(BatchIOSetUp, lib, KVIKIO_STRINGIFY(cuFileBatchIOSetUp));
+    get_symbol(BatchIOSubmit, lib, KVIKIO_STRINGIFY(cuFileBatchIOSubmit));
+    get_symbol(BatchIOGetStatus, lib, KVIKIO_STRINGIFY(cuFileBatchIOGetStatus));
+    get_symbol(BatchIOCancel, lib, KVIKIO_STRINGIFY(cuFileBatchIOCancel));
+    get_symbol(BatchIODestroy, lib, KVIKIO_STRINGIFY(cuFileBatchIODestroy));
+  }
+  if (version >= 1070) {
+    get_symbol(ReadAsync, lib, KVIKIO_STRINGIFY(cuFileReadAsync));
+    get_symbol(WriteAsync, lib, KVIKIO_STRINGIFY(cuFileWriteAsync));
+    get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister));
+    get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister));
+  }
+
+  // cuFile is supposed to open and close the driver automatically but
+  // because of a bug in cuFile v1.4 (CUDA v11.8) it sometimes segfaults:
+  // <https://github.com/rapidsai/kvikio/issues/159>.
+  if (version < 1050) { driver_open(); }
+}
+
+// Notice, we have to close the driver at program exit (if we opened it) even though we are
+// not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the
+// driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but
+// this seems to be the lesser of two evils.
+// [1] <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
+cuFileAPI::~cuFileAPI()
+{
+  if (version < 1050) { driver_close(); }
+}
+#else
+cuFileAPI::cuFileAPI() { throw std::runtime_error("KvikIO not compiled with cuFile.h"); }
+#endif
+
+cuFileAPI& cuFileAPI::instance()
+{
+  static cuFileAPI _instance;
+  return _instance;
+}
+
+void cuFileAPI::driver_open()
+{
+  CUfileError_t const error = DriverOpen();
+  if (error.err != CU_FILE_SUCCESS) {
+    throw std::runtime_error(std::string{"Unable to open GDS file driver: "} +
+                             cufileop_status_error(error.err));
+  }
+}
+
+void cuFileAPI::driver_close()
+{
+  CUfileError_t const error = DriverClose();
+  if (error.err != CU_FILE_SUCCESS) {
+    throw std::runtime_error(std::string{"Unable to close GDS file driver: "} +
+                             cufileop_status_error(error.err));
+  }
+}
+
+#ifdef KVIKIO_CUFILE_FOUND
+bool is_cufile_library_available()
+{
+  try {
+    cuFileAPI::instance();
+  } catch (const std::runtime_error&) {
+    return false;
+  }
+  return true;
+}
+#endif
+
+bool is_cufile_available()
+{
+  return is_cufile_library_available() && run_udev_readable() && !is_running_in_wsl();
+}
+
+#ifdef KVIKIO_CUFILE_FOUND
+int cufile_version()
+{
+  try {
+    return cuFileAPI::instance().version;
+  } catch (std::runtime_error const&) {
+    return 0;
+  }
+}
+#endif
+
+bool is_batch_api_available() noexcept { return cufile_version() >= 1060; }
+
+bool is_stream_api_available() noexcept { return cufile_version() >= 1070; }
+
+}  // namespace kvikio
diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp
new file mode 100644
index 0000000000..655a7f70fc
--- /dev/null
+++ b/cpp/src/shim/libcurl.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <curl/curl.h>
+
+#include <kvikio/defaults.hpp>
+#include <kvikio/error.hpp>
+#include <kvikio/parallel_operation.hpp>
+#include <kvikio/posix_io.hpp>
+#include <kvikio/shim/libcurl.hpp>
+#include <kvikio/utils.hpp>
+
+namespace kvikio {
+
+LibCurl::LibCurl()
+{
+  CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT);
+  if (err != CURLE_OK) {
+    throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err));
+  }
+  curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW);
+  if ((ver->features & CURL_VERSION_THREADSAFE) == 0) {
+    throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled");
+  }
+}
+
+LibCurl::~LibCurl() noexcept
+{
+  _free_curl_handles.clear();
+  curl_global_cleanup();
+}
+
+LibCurl& LibCurl::instance()
+{
+  static LibCurl _instance;
+  return _instance;
+}
+
+LibCurl::UniqueHandlePtr LibCurl::get_free_handle()
+{
+  UniqueHandlePtr ret;
+  std::lock_guard const lock(_mutex);
+  if (!_free_curl_handles.empty()) {
+    ret = std::move(_free_curl_handles.back());
+    _free_curl_handles.pop_back();
+  }
+  return ret;
+}
+
+LibCurl::UniqueHandlePtr LibCurl::get_handle()
+{
+  // Check if we have a free handle available.
+  UniqueHandlePtr ret = get_free_handle();
+  if (ret) {
+    curl_easy_reset(ret.get());
+  } else {
+    // If not, we create a new handle.
+    CURL* raw_handle = curl_easy_init();
+    if (raw_handle == nullptr) {
+      throw std::runtime_error("libcurl: call to curl_easy_init() failed");
+    }
+    ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup);
+  }
+  return ret;
+}
+
+void LibCurl::retain_handle(UniqueHandlePtr handle)
+{
+  std::lock_guard const lock(_mutex);
+  _free_curl_handles.push_back(std::move(handle));
+}
+
+CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle,
+                       std::string source_file,
+                       std::string source_line)
+  : _handle{std::move(handle)},
+    _source_file(std::move(source_file)),
+    _source_line(std::move(source_line))
+{
+  // Need CURLOPT_NOSIGNAL to support threading, see
+  // <https://curl.se/libcurl/c/CURLOPT_NOSIGNAL.html>
+  setopt(CURLOPT_NOSIGNAL, 1L);
+
+  // We always set CURLOPT_ERRORBUFFER to get better error messages.
+  _errbuf[0] = 0;  // Set the error buffer as empty.
+  setopt(CURLOPT_ERRORBUFFER, _errbuf);
+
+  // Make curl_easy_perform() fail when receiving HTTP code errors.
+  setopt(CURLOPT_FAILONERROR, 1L);
+}
+
+CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); }
+
+CURL* CurlHandle::handle() noexcept { return _handle.get(); }
+
+void CurlHandle::perform()
+{
+  // Perform the curl operation and check for errors.
+  CURLcode err = curl_easy_perform(handle());
+  if (err != CURLE_OK) {
+    std::string msg(_errbuf);  // We can do this because we always initialize `_errbuf` as empty.
+    std::stringstream ss;
+    ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line;
+    if (msg.empty()) {
+      ss << "(" << curl_easy_strerror(err) << ")";
+    } else {
+      ss << "(" << msg << ")";
+    }
+    throw std::runtime_error(ss.str());
+  }
+}
+
+}  // namespace kvikio
diff --git a/cpp/src/shim/utils.cpp b/cpp/src/shim/utils.cpp
new file mode 100644
index 0000000000..ab9afbf648
--- /dev/null
+++ b/cpp/src/shim/utils.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <dlfcn.h>
+#include <sys/utsname.h>
+#include <filesystem>
+#include <sstream>
+#include <vector>
+
+#include <kvikio/shim/utils.hpp>
+
+namespace kvikio {
+
+void* load_library(const char* name, int mode)
+{
+  ::dlerror();  // Clear old errors
+  void* ret = ::dlopen(name, mode);
+  if (ret == nullptr) { throw std::runtime_error(::dlerror()); }
+  return ret;
+}
+
+void* load_library(const std::vector<const char*>& names, int mode)
+{
+  std::stringstream ss;
+  for (const char* name : names) {
+    ss << name << " ";
+    try {
+      return load_library(name, mode);
+    } catch (const std::runtime_error&) {
+    }
+  }
+  throw std::runtime_error("cannot open shared object file, tried: " + ss.str());
+}
+
+[[nodiscard]] bool is_running_in_wsl()
+{
+  struct utsname buf {};
+  int err = ::uname(&buf);
+  if (err == 0) {
+    const std::string name(static_cast<char*>(buf.release));
+    // 'Microsoft' for WSL1 and 'microsoft' for WSL2
+    return name.find("icrosoft") != std::string::npos;
+  }
+  return false;
+}
+
+[[nodiscard]] bool run_udev_readable()
+{
+  try {
+    return std::filesystem::is_directory("/run/udev");
+  } catch (const std::filesystem::filesystem_error&) {
+    return false;
+  }
+}
+
+}  // namespace kvikio

From 786b8a2746bf310b9f5ff53401afb8fd296043e8 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Wed, 8 Jan 2025 10:50:30 -0500
Subject: [PATCH 2/8] Fixes

---
 cpp/include/kvikio/cufile/driver.hpp | 49 +++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp
index 269761c75d..4a33289bda 100644
--- a/cpp/include/kvikio/cufile/driver.hpp
+++ b/cpp/include/kvikio/cufile/driver.hpp
@@ -22,6 +22,8 @@
 
 namespace kvikio {
 
+#ifdef KVIKIO_CUFILE_FOUND
+
 class DriverInitializer {
   // Optional, if not used cuFiles opens the driver automatically
  public:
@@ -45,12 +47,7 @@ class DriverProperties {
   void lazy_init();
 
  public:
-#ifdef KVIKIO_CUFILE_FOUND
   DriverProperties() = default;
-#else
-  // Implement a non-default constructor to avoid `unused variable` warnings downstream
-  DriverProperties();
-#endif
 
   bool is_gds_available();
 
@@ -83,4 +80,46 @@ class DriverProperties {
   [[nodiscard]] std::size_t get_max_batch_io_size();
 };
 
+#else
+struct DriverInitializer {
+  // Implement a non-default constructor to avoid `unused variable` warnings downstream
+  DriverInitializer();
+};
+
+struct DriverProperties {
+  // Implement a non-default constructor to avoid `unused variable` warnings downstream
+  DriverProperties();
+
+  static bool is_gds_available();
+
+  [[nodiscard]] static unsigned int get_nvfs_major_version();
+
+  [[nodiscard]] static unsigned int get_nvfs_minor_version();
+
+  [[nodiscard]] static bool get_nvfs_allow_compat_mode();
+
+  [[nodiscard]] static bool get_nvfs_poll_mode();
+
+  [[nodiscard]] static std::size_t get_nvfs_poll_thresh_size();
+
+  static void set_nvfs_poll_mode(bool enable);
+
+  static void set_nvfs_poll_thresh_size(std::size_t size_in_kb);
+
+  [[nodiscard]] static std::vector<CUfileDriverControlFlags> get_nvfs_statusflags();
+
+  [[nodiscard]] static std::size_t get_max_device_cache_size();
+
+  static void set_max_device_cache_size(std::size_t size_in_kb);
+
+  [[nodiscard]] static std::size_t get_per_buffer_cache_size();
+
+  [[nodiscard]] static std::size_t get_max_pinned_memory_size();
+
+  static void set_max_pinned_memory_size(std::size_t size_in_kb);
+
+  [[nodiscard]] std::size_t get_max_batch_io_size();
+};
+#endif
+
 }  // namespace kvikio

From 074ccff5531938ea3a78d64215a083d38c031f6b Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Wed, 8 Jan 2025 11:22:21 -0500
Subject: [PATCH 3/8] Fixes. Continue the work

---
 cpp/include/kvikio/utils.hpp | 152 +++-------------------------
 cpp/src/cufile/driver.cpp    |  28 +++---
 cpp/src/utils.cpp            | 186 +++++++++++++++++++++++++++++++++++
 3 files changed, 214 insertions(+), 152 deletions(-)
 create mode 100644 cpp/src/utils.cpp

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 3cad457ffa..5593c6c9de 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,6 @@
 #include <chrono>
 #include <cstring>
 #include <future>
-#include <iostream>
-#include <map>
 #include <optional>
 #include <stdexcept>
 #include <tuple>
@@ -29,7 +27,6 @@
 #include <nvtx3/nvtx3.hpp>
 #endif
 
-#include <kvikio/error.hpp>
 #include <kvikio/shim/cuda.hpp>
 
 namespace kvikio {
@@ -37,27 +34,11 @@ namespace kvikio {
 // cuFile defines a page size to 4 KiB
 inline constexpr std::size_t page_size = 4096;
 
-[[nodiscard]] inline off_t convert_size2off(std::size_t x)
-{
-  if (x >= static_cast<std::size_t>(std::numeric_limits<off_t>::max())) {
-    throw CUfileException("size_t argument too large to fit off_t");
-  }
-  return static_cast<off_t>(x);
-}
+[[nodiscard]] off_t convert_size2off(std::size_t x);
 
-[[nodiscard]] inline ssize_t convert_size2ssize(std::size_t x)
-{
-  if (x >= static_cast<std::size_t>(std::numeric_limits<ssize_t>::max())) {
-    throw CUfileException("size_t argument too large to fit ssize_t");
-  }
-  return static_cast<ssize_t>(x);
-}
+[[nodiscard]] ssize_t convert_size2ssize(std::size_t x);
 
-[[nodiscard]] inline CUdeviceptr convert_void2deviceptr(const void* devPtr)
-{
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
-  return reinterpret_cast<CUdeviceptr>(devPtr);
-}
+[[nodiscard]] CUdeviceptr convert_void2deviceptr(const void* devPtr);
 
 /**
  * @brief Help function to convert value to 64 bit signed integer
@@ -91,25 +72,7 @@ template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
  * @return The boolean answer
  */
 #ifdef KVIKIO_CUDA_FOUND
-inline bool is_host_memory(const void* ptr)
-{
-  CUpointer_attribute attrs[1] = {
-    CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
-  };
-  CUmemorytype memtype{};
-  void* data[1] = {&memtype};
-  CUresult result =
-    cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));
-
-  // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED
-  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
-  CUDA_DRIVER_TRY(result);
-
-  // Notice, queying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory
-  // is unregistered host memory. This is undocumented but how the Runtime CUDA API
-  // does it to support `cudaMemoryTypeUnregistered`.
-  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
-}
+bool is_host_memory(const void* ptr);
 #else
 constexpr bool is_host_memory(const void* ptr) { return true; }
 #endif
@@ -120,13 +83,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; }
  * @param ptr Device pointer to query
  * @return The device ordinal
  */
-[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
-{
-  int ret = 0;
-  CUDA_DRIVER_TRY(
-    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
-  return ret;
-}
+[[nodiscard]] int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr);
 
 /**
  * @brief Given a device ordinal, return the primary context of the device.
@@ -136,25 +93,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; }
  * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
  * @return Primary CUDA context
  */
-[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal)
-{
-  static std::map<int, CUcontext> _cache;
-  static std::mutex _mutex;
-  std::lock_guard const lock(_mutex);
-
-  if (_cache.find(ordinal) == _cache.end()) {
-    CUdevice dev{};
-    CUcontext ctx{};
-    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal));
-
-    // Notice, we let the primary context leak at program exit. We do this because `_cache`
-    // is static and we are not allowed to call `cuDevicePrimaryCtxRelease()` after main:
-    // <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
-    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
-    _cache.emplace(ordinal, ctx);
-  }
-  return _cache.at(ordinal);
-}
+[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal);
 
 /**
  * @brief Return the CUDA context associated the given device pointer, if any.
@@ -162,15 +101,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; }
  * @param dev_ptr Device pointer to query
  * @return Usable CUDA context, if one were found.
  */
-[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
-{
-  CUcontext ctx = nullptr;
-  const CUresult err =
-    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
-  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
-  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
-  return {};
-}
+[[nodiscard]] std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr);
 
 /**
  * @brief Check if the current CUDA context can access the given device pointer
@@ -178,15 +109,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; }
  * @param dev_ptr Device pointer to query
  * @return The boolean answer
  */
-[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
-{
-  CUdeviceptr current_ctx_dev_ptr{};
-  const CUresult err = cudaAPI::instance().PointerGetAttribute(
-    &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
-  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
-  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
-  return false;
-}
+[[nodiscard]] bool current_context_can_access_pointer(CUdeviceptr dev_ptr);
 
 /**
  * @brief Return a CUDA context that can be used with the given device pointer
@@ -204,28 +127,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; }
  * @param devPtr Device pointer to query
  * @return Usable CUDA context
  */
-[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
-{
-  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
-
-  // First we check if a context has been associated with `devPtr`.
-  {
-    auto ctx = get_context_associated_pointer(dev_ptr);
-    if (ctx.has_value()) { return ctx.value(); }
-  }
-
-  // If this isn't the case, we check the current context. If it exist and can access `devPtr`, we
-  // return the current context.
-  {
-    CUcontext ctx = nullptr;
-    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
-    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
-  }
-
-  // Finally, if we didn't find any usable context, we return the primary context of the
-  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
-  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
-}
+[[nodiscard]] CUcontext get_context_from_pointer(const void* devPtr);
 
 /**
  * @brief Push CUDA context on creation and pop it on destruction
@@ -235,43 +137,17 @@ class PushAndPopContext {
   CUcontext _ctx;
 
  public:
-  PushAndPopContext(CUcontext ctx) : _ctx{ctx}
-  {
-    CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx));
-  }
+  PushAndPopContext(CUcontext ctx);
   PushAndPopContext(const PushAndPopContext&)            = delete;
   PushAndPopContext& operator=(PushAndPopContext const&) = delete;
   PushAndPopContext(PushAndPopContext&&)                 = delete;
   PushAndPopContext&& operator=(PushAndPopContext&&)     = delete;
-  ~PushAndPopContext()
-  {
-    try {
-      CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException);
-    } catch (const CUfileException& e) {
-      std::cerr << e.what() << std::endl;
-    }
-  }
+  ~PushAndPopContext();
 };
 
 // Find the base and offset of the memory allocation `devPtr` is in
-inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
-                                                                  CUcontext* ctx = nullptr)
-{
-  auto dev = convert_void2deviceptr(devPtr);
-  CUdeviceptr base_ptr{};
-  std::size_t base_size{};
-  CUcontext _ctx{};
-  if (ctx != nullptr) {
-    _ctx = *ctx;
-  } else {
-    _ctx = get_context_from_pointer(devPtr);
-  }
-  PushAndPopContext context(_ctx);
-  CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
-  std::size_t offset = dev - base_ptr;
-  // NOLINTNEXTLINE(performance-no-int-to-ptr, cppcoreguidelines-pro-type-reinterpret-cast)
-  return std::make_tuple(reinterpret_cast<void*>(base_ptr), base_size, offset);
-}
+std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
+                                                           CUcontext* ctx = nullptr);
 
 template <typename T>
 inline bool is_future_done(const T& future)
diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp
index 959a64b33b..127050ed06 100644
--- a/cpp/src/cufile/driver.cpp
+++ b/cpp/src/cufile/driver.cpp
@@ -174,69 +174,69 @@ DriverInitializer::DriverInitializer() {}
 
 DriverProperties::DriverProperties() {}
 
-static bool DriverProperties::is_gds_available() { return false; }
+bool DriverProperties::is_gds_available() { return false; }
 
-[[nodiscard]] static unsigned int DriverProperties::get_nvfs_major_version()
+[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static unsigned int DriverProperties::get_nvfs_minor_version()
+[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static bool DriverProperties::get_nvfs_allow_compat_mode()
+[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static bool DriverProperties::get_nvfs_poll_mode()
+[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static std::size_t DriverProperties::get_nvfs_poll_thresh_size()
+[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-static void DriverProperties::set_nvfs_poll_mode(bool enable)
+void DriverProperties::set_nvfs_poll_mode(bool enable)
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-static void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb)
+void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb)
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
+[[nodiscard]] std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static std::size_t DriverProperties::get_max_device_cache_size()
+[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-static void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb)
+void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb)
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static std::size_t DriverProperties::get_per_buffer_cache_size()
+[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] static std::size_t DriverProperties::get_max_pinned_memory_size()
+[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-static void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb)
+void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb)
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp
new file mode 100644
index 0000000000..32834cf3a4
--- /dev/null
+++ b/cpp/src/utils.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstring>
+#include <future>
+#include <iostream>
+#include <map>
+#include <optional>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+
+#ifdef KVIKIO_CUDA_FOUND
+#include <nvtx3/nvtx3.hpp>
+#endif
+
+#include <kvikio/error.hpp>
+#include <kvikio/shim/cuda.hpp>
+#include <kvikio/utils.hpp>
+
+namespace kvikio {
+
+off_t convert_size2off(std::size_t x)
+{
+  if (x >= static_cast<std::size_t>(std::numeric_limits<off_t>::max())) {
+    throw CUfileException("size_t argument too large to fit off_t");
+  }
+  return static_cast<off_t>(x);
+}
+
+ssize_t convert_size2ssize(std::size_t x)
+{
+  if (x >= static_cast<std::size_t>(std::numeric_limits<ssize_t>::max())) {
+    throw CUfileException("size_t argument too large to fit ssize_t");
+  }
+  return static_cast<ssize_t>(x);
+}
+
+CUdeviceptr convert_void2deviceptr(const void* devPtr)
+{
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+  return reinterpret_cast<CUdeviceptr>(devPtr);
+}
+
+#ifdef KVIKIO_CUDA_FOUND
+bool is_host_memory(const void* ptr)
+{
+  CUpointer_attribute attrs[1] = {
+    CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+  };
+  CUmemorytype memtype{};
+  void* data[1] = {&memtype};
+  CUresult result =
+    cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));
+
+  // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED
+  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
+  CUDA_DRIVER_TRY(result);
+
+  // Notice, querying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory
+  // is unregistered host memory. This is undocumented but how the Runtime CUDA API
+  // does it to support `cudaMemoryTypeUnregistered`.
+  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
+}
+#endif
+
+int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
+{
+  int ret = 0;
+  CUDA_DRIVER_TRY(
+    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
+  return ret;
+}
+
+CUcontext get_primary_cuda_context(int ordinal)
+{
+  static std::map<int, CUcontext> _cache;
+  static std::mutex _mutex;
+  std::lock_guard const lock(_mutex);
+
+  if (_cache.find(ordinal) == _cache.end()) {
+    CUdevice dev{};
+    CUcontext ctx{};
+    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, ordinal));
+
+    // Notice, we let the primary context leak at program exit. We do this because `_cache`
+    // is static and we are not allowed to call `cuDevicePrimaryCtxRelease()` after main:
+    // <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
+    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
+    _cache.emplace(ordinal, ctx);
+  }
+  return _cache.at(ordinal);
+}
+
+std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
+{
+  CUcontext ctx = nullptr;
+  const CUresult err =
+    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
+  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
+  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
+  return {};
+}
+
+bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
+{
+  CUdeviceptr current_ctx_dev_ptr{};
+  const CUresult err = cudaAPI::instance().PointerGetAttribute(
+    &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
+  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
+  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
+  return false;
+}
+
+CUcontext get_context_from_pointer(const void* devPtr)
+{
+  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
+
+  // First we check if a context has been associated with `devPtr`.
+  {
+    auto ctx = get_context_associated_pointer(dev_ptr);
+    if (ctx.has_value()) { return ctx.value(); }
+  }
+
+  // If this isn't the case, we check the current context. If it exists and can access `devPtr`,
+  // we return the current context.
+  {
+    CUcontext ctx = nullptr;
+    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
+    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
+  }
+
+  // Finally, if we didn't find any usable context, we return the primary context of the
+  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
+  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
+}
+
+PushAndPopContext::PushAndPopContext(CUcontext ctx) : _ctx{ctx}
+{
+  CUDA_DRIVER_TRY(cudaAPI::instance().CtxPushCurrent(_ctx));
+}
+
+PushAndPopContext::~PushAndPopContext()
+{
+  try {
+    CUDA_DRIVER_TRY(cudaAPI::instance().CtxPopCurrent(&_ctx), CUfileException);
+  } catch (const CUfileException& e) {
+    std::cerr << e.what() << std::endl;
+  }
+}
+
+// Find the base and offset of the memory allocation `devPtr` is in
+std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
+                                                           CUcontext* ctx = nullptr)
+{
+  auto dev = convert_void2deviceptr(devPtr);
+  CUdeviceptr base_ptr{};
+  std::size_t base_size{};
+  CUcontext _ctx{};
+  if (ctx != nullptr) {
+    _ctx = *ctx;
+  } else {
+    _ctx = get_context_from_pointer(devPtr);
+  }
+  PushAndPopContext context(_ctx);
+  CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
+  std::size_t offset = dev - base_ptr;
+  // NOLINTNEXTLINE(performance-no-int-to-ptr, cppcoreguidelines-pro-type-reinterpret-cast)
+  return std::make_tuple(reinterpret_cast<void*>(base_ptr), base_size, offset);
+}
+
+}  // namespace kvikio

From 76830b9c3c84d74333e516c40313a16a8418318c Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Wed, 8 Jan 2025 16:08:32 -0500
Subject: [PATCH 4/8] Complete the initial separation

---
 cpp/CMakeLists.txt                   |  18 ++-
 cpp/include/kvikio/batch.hpp         |  84 +++--------
 cpp/include/kvikio/bounce_buffer.hpp |  84 ++---------
 cpp/include/kvikio/buffer.hpp        |  52 ++-----
 cpp/include/kvikio/defaults.hpp      | 175 +++------------------
 cpp/include/kvikio/posix_io.hpp      |  58 ++-----
 cpp/include/kvikio/remote_handle.hpp | 200 ++----------------------
 cpp/include/kvikio/stream.hpp        |  79 ++--------
 cpp/src/batch.cpp                    | 118 +++++++++++++++
 cpp/src/bounce_buffer.cpp            | 107 +++++++++++++
 cpp/src/buffer.cpp                   |  64 ++++++++
 cpp/src/cufile/config.cpp            |   6 +-
 cpp/src/cufile/driver.cpp            |  54 +++----
 cpp/src/defaults.cpp                 | 203 +++++++++++++++++++++++++
 cpp/src/error.cpp                    |  17 +++
 cpp/src/file_handle.cpp              |   6 +-
 cpp/src/posix_io.cpp                 |  79 ++++++++++
 cpp/src/remote_handle.cpp            | 218 ++++++++++++++++++++++++++-
 cpp/src/shim/utils.cpp               |   4 +-
 cpp/src/stream.cpp                   | 102 +++++++++++++
 cpp/src/utils.cpp                    |   5 +-
 21 files changed, 1054 insertions(+), 679 deletions(-)
 create mode 100644 cpp/src/batch.cpp
 create mode 100644 cpp/src/bounce_buffer.cpp
 create mode 100644 cpp/src/buffer.cpp
 create mode 100644 cpp/src/defaults.cpp
 create mode 100644 cpp/src/error.cpp
 create mode 100644 cpp/src/posix_io.cpp
 create mode 100644 cpp/src/stream.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ea7d29a06f..5f288d370f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -131,8 +131,22 @@ include(cmake/thirdparty/get_thread_pool.cmake)
 # ##################################################################################################
 # * library targets --------------------------------------------------------------------------------
 
-set(SOURCES "src/file_handle.cpp" "src/cufile/config.cpp" "src/cufile/driver.cpp"
-            "src/shim/cuda.cpp" "src/shim/cufile.cpp" "src/shim/libcurl.cpp" "src/shim/utils.cpp"
+set(SOURCES
+    "src/batch.cpp"
+    "src/bounce_buffer.cpp"
+    "src/buffer.cpp"
+    "src/cufile/config.cpp"
+    "src/cufile/driver.cpp"
+    "src/defaults.cpp"
+    "src/error.cpp"
+    "src/file_handle.cpp"
+    "src/posix_io.cpp"
+    "src/shim/cuda.cpp"
+    "src/shim/cufile.cpp"
+    "src/shim/libcurl.cpp"
+    "src/shim/utils.cpp"
+    "src/stream.cpp"
+    "src/utils.cpp"
 )
 
 if(KvikIO_REMOTE_SUPPORT)
diff --git a/cpp/include/kvikio/batch.hpp b/cpp/include/kvikio/batch.hpp
index 7eebbd4df0..9927962a65 100644
--- a/cpp/include/kvikio/batch.hpp
+++ b/cpp/include/kvikio/batch.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -73,36 +73,22 @@ class BatchHandle {
    *
    * @param max_num_events The maximum number of operations supported by this instance.
    */
-  BatchHandle(int max_num_events) : _initialized{true}, _max_num_events{max_num_events}
-  {
-    CUFILE_TRY(cuFileAPI::instance().BatchIOSetUp(&_handle, max_num_events));
-  }
+  BatchHandle(int max_num_events);
 
   /**
    * @brief BatchHandle support move semantic but isn't copyable
    */
   BatchHandle(const BatchHandle&)            = delete;
   BatchHandle& operator=(BatchHandle const&) = delete;
-  BatchHandle(BatchHandle&& o) noexcept
-    : _initialized{std::exchange(o._initialized, false)},
-      _max_num_events{std::exchange(o._max_num_events, 0)}
-  {
-    _handle = std::exchange(o._handle, CUfileBatchHandle_t{});
-  }
-  ~BatchHandle() noexcept { close(); }
+  BatchHandle(BatchHandle&& o) noexcept;
+  ~BatchHandle() noexcept;
 
-  [[nodiscard]] bool closed() const noexcept { return !_initialized; }
+  [[nodiscard]] bool closed() const noexcept;
 
   /**
    * @brief Destroy the batch handle and free up resources
    */
-  void close() noexcept
-  {
-    if (closed()) { return; }
-    _initialized = false;
-
-    cuFileAPI::instance().BatchIODestroy(_handle);
-  }
+  void close() noexcept;
 
   /**
    * @brief Submit a vector of batch operations
@@ -110,31 +96,7 @@ class BatchHandle {
    * @param operations The vector of batch operations, which must not exceed the
    * `max_num_events`.
    */
-  void submit(const std::vector<BatchOp>& operations)
-  {
-    if (convert_size2ssize(operations.size()) > _max_num_events) {
-      throw CUfileException("Cannot submit more than the max_num_events)");
-    }
-    std::vector<CUfileIOParams_t> io_batch_params;
-    io_batch_params.reserve(operations.size());
-    for (const auto& op : operations) {
-      if (op.file_handle.is_compat_mode_preferred()) {
-        throw CUfileException("Cannot submit a FileHandle opened in compatibility mode");
-      }
-
-      io_batch_params.push_back(CUfileIOParams_t{.mode   = CUFILE_BATCH,
-                                                 .u      = {.batch = {.devPtr_base   = op.devPtr_base,
-                                                                      .file_offset   = op.file_offset,
-                                                                      .devPtr_offset = op.devPtr_offset,
-                                                                      .size          = op.size}},
-                                                 .fh     = op.file_handle.handle(),
-                                                 .opcode = op.opcode,
-                                                 .cookie = nullptr});
-    }
-
-    CUFILE_TRY(cuFileAPI::instance().BatchIOSubmit(
-      _handle, io_batch_params.size(), io_batch_params.data(), 0));
-  }
+  void submit(const std::vector<BatchOp>& operations);
 
   /**
    * @brief Get status of submitted operations
@@ -148,16 +110,9 @@ class BatchHandle {
    */
   std::vector<CUfileIOEvents_t> status(unsigned min_nr,
                                        unsigned max_nr,
-                                       struct timespec* timeout = nullptr)
-  {
-    std::vector<CUfileIOEvents_t> ret;
-    ret.resize(_max_num_events);
-    CUFILE_TRY(cuFileAPI::instance().BatchIOGetStatus(_handle, min_nr, &max_nr, &ret[0], timeout));
-    ret.resize(max_nr);
-    return ret;
-  }
-
-  void cancel() { CUFILE_TRY(cuFileAPI::instance().BatchIOCancel(_handle)); }
+                                       struct timespec* timeout = nullptr);
+
+  void cancel();
 };
 
 #else
@@ -166,24 +121,19 @@ class BatchHandle {
  public:
   BatchHandle() noexcept = default;
 
-  BatchHandle(int max_num_events)
-  {
-    throw CUfileException("BatchHandle requires cuFile's batch API, please build with CUDA v12.1+");
-  }
+  BatchHandle(int max_num_events);
 
-  [[nodiscard]] bool closed() const noexcept { return true; }
+  [[nodiscard]] bool closed() const noexcept;
 
-  void close() noexcept {}
+  void close() noexcept;
 
-  void submit(const std::vector<BatchOp>& operations) {}
+  void submit(const std::vector<BatchOp>& operations);
 
   std::vector<CUfileIOEvents_t> status(unsigned min_nr,
                                        unsigned max_nr,
-                                       struct timespec* timeout = nullptr)
-  {
-    return std::vector<CUfileIOEvents_t>{};
-  }
-  void cancel() {}
+                                       struct timespec* timeout = nullptr);
+
+  void cancel();
 };
 
 #endif
diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp
index 498f1d6f5f..5a7623a6a4 100644
--- a/cpp/include/kvikio/bounce_buffer.hpp
+++ b/cpp/include/kvikio/bounce_buffer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <mutex>
 #include <stack>
 
 #include <kvikio/defaults.hpp>
@@ -47,18 +46,15 @@ class AllocRetain {
     std::size_t const _size;
 
    public:
-    Alloc(AllocRetain* manager, void* alloc, std::size_t size)
-      : _manager(manager), _alloc{alloc}, _size{size}
-    {
-    }
+    Alloc(AllocRetain* manager, void* alloc, std::size_t size);
     Alloc(Alloc const&)            = delete;
     Alloc& operator=(Alloc const&) = delete;
     Alloc(Alloc&& o)               = delete;
     Alloc& operator=(Alloc&& o)    = delete;
-    ~Alloc() noexcept { _manager->put(_alloc, _size); }
-    void* get() noexcept { return _alloc; }
-    void* get(std::ptrdiff_t offset) noexcept { return static_cast<char*>(_alloc) + offset; }
-    std::size_t size() noexcept { return _size; }
+    ~Alloc() noexcept;
+    void* get() noexcept;
+    void* get(std::ptrdiff_t offset) noexcept;
+    std::size_t size() noexcept;
   };
 
   AllocRetain() = default;
@@ -77,80 +73,28 @@ class AllocRetain {
    *
    * @return The number of bytes cleared
    */
-  std::size_t _clear()
-  {
-    std::size_t ret = _free_allocs.size() * _size;
-    while (!_free_allocs.empty()) {
-      CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(_free_allocs.top()));
-      _free_allocs.pop();
-    }
-    return ret;
-  }
+  std::size_t _clear();
 
   /**
    * @brief Ensure the sizes of the retained allocations match `defaults::bounce_buffer_size()`
    *
    * NB: `_mutex` must be taken prior to calling this function.
    */
-  void _ensure_alloc_size()
-  {
-    auto const bounce_buffer_size = defaults::bounce_buffer_size();
-    if (_size != bounce_buffer_size) {
-      _clear();
-      _size = bounce_buffer_size;
-    }
-  }
+  void _ensure_alloc_size();
 
  public:
-  [[nodiscard]] Alloc get()
-  {
-    std::lock_guard const lock(_mutex);
-    _ensure_alloc_size();
-
-    // Check if we have an allocation available
-    if (!_free_allocs.empty()) {
-      void* ret = _free_allocs.top();
-      _free_allocs.pop();
-      return Alloc(this, ret, _size);
-    }
-
-    // If no available allocation, allocate and register a new one
-    void* alloc{};
-    // Allocate page-locked host memory
-    CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&alloc, _size, CU_MEMHOSTREGISTER_PORTABLE));
-    return Alloc(this, alloc, _size);
-  }
-
-  void put(void* alloc, std::size_t size)
-  {
-    std::lock_guard const lock(_mutex);
-    _ensure_alloc_size();
-
-    // If the size of `alloc` matches the sizes of the retained allocations,
-    // it is added to the set of free allocation otherwise it is freed.
-    if (size == _size) {
-      _free_allocs.push(alloc);
-    } else {
-      CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(alloc));
-    }
-  }
+  [[nodiscard]] Alloc get();
+
+  void put(void* alloc, std::size_t size);
 
   /**
    * @brief Free all retained allocations
    *
    * @return The number of bytes cleared
    */
-  std::size_t clear()
-  {
-    std::lock_guard const lock(_mutex);
-    return _clear();
-  }
-
-  KVIKIO_EXPORT static AllocRetain& instance()
-  {
-    static AllocRetain _instance;
-    return _instance;
-  }
+  std::size_t clear();
+
+  KVIKIO_EXPORT static AllocRetain& instance();
 
   AllocRetain(AllocRetain const&)            = delete;
   AllocRetain& operator=(AllocRetain const&) = delete;
diff --git a/cpp/include/kvikio/buffer.hpp b/cpp/include/kvikio/buffer.hpp
index 85c60b3f90..9cef45a6f0 100644
--- a/cpp/include/kvikio/buffer.hpp
+++ b/cpp/include/kvikio/buffer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,17 +15,8 @@
  */
 #pragma once
 
-#include <algorithm>
-#include <iostream>
-#include <map>
 #include <vector>
 
-#include <kvikio/defaults.hpp>
-#include <kvikio/error.hpp>
-#include <kvikio/shim/cufile.hpp>
-#include <kvikio/shim/cufile_h_wrapper.hpp>
-#include <kvikio/utils.hpp>
-
 namespace kvikio {
 
 /**
@@ -44,32 +35,17 @@ namespace kvikio {
  * streaming buffer that is reused across multiple cuFile IO operations.
  */
 /*NOLINTNEXTLINE(readability-function-cognitive-complexity)*/
-inline void buffer_register(const void* devPtr_base,
-                            std::size_t size,
-                            int flags                                = 0,
-                            const std::vector<int>& errors_to_ignore = std::vector<int>())
-{
-  if (defaults::is_compat_mode_preferred()) { return; }
-  CUfileError_t status = cuFileAPI::instance().BufRegister(devPtr_base, size, flags);
-  if (status.err != CU_FILE_SUCCESS) {
-    // Check if `status.err` is in `errors_to_ignore`
-    if (std::find(errors_to_ignore.begin(), errors_to_ignore.end(), status.err) ==
-        errors_to_ignore.end()) {
-      CUFILE_TRY(status);
-    }
-  }
-}
+void buffer_register(const void* devPtr_base,
+                     std::size_t size,
+                     int flags                                = 0,
+                     const std::vector<int>& errors_to_ignore = std::vector<int>());
 
 /**
  * @brief deregister an already registered device memory from cuFile
  *
  * @param devPtr_base  device pointer to deregister
  */
-inline void buffer_deregister(const void* devPtr_base)
-{
-  if (defaults::is_compat_mode_preferred()) { return; }
-  CUFILE_TRY(cuFileAPI::instance().BufDeregister(devPtr_base));
-}
+void buffer_deregister(const void* devPtr_base);
 
 /**
  * @brief Register device memory allocation which is part of devPtr. Use this
@@ -85,23 +61,15 @@ inline void buffer_deregister(const void* devPtr_base)
  * @warning This API is intended for usecases where the memory is used as
  * streaming buffer that is reused across multiple cuFile IO operations.
  */
-inline void memory_register(const void* devPtr,
-                            int flags                                = 0,
-                            const std::vector<int>& errors_to_ignore = {})
-{
-  auto [base, nbytes, offset] = get_alloc_info(devPtr);
-  buffer_register(base, nbytes, flags, errors_to_ignore);
-}
+void memory_register(const void* devPtr,
+                     int flags                                = 0,
+                     const std::vector<int>& errors_to_ignore = {});
 
 /**
  * @brief  deregister an already registered device memory from cuFile.
  *
  * @param devPtr device pointer to deregister
  */
-inline void memory_deregister(const void* devPtr)
-{
-  auto [base, nbytes, offset] = get_alloc_info(devPtr);
-  buffer_deregister(base);
-}
+void memory_deregister(const void* devPtr);
 
 }  // namespace kvikio
diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp
index 91071cbb28..4c87724445 100644
--- a/cpp/include/kvikio/defaults.hpp
+++ b/cpp/include/kvikio/defaults.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
+// Enable documentation of the enum.
 /**
  * @file
  */
 
 #pragma once
 
-#include <algorithm>
 #include <cstddef>
 #include <cstdlib>
 #include <sstream>
@@ -55,25 +55,7 @@ namespace detail {
  *   - `AUTO`
  * @return A CompatMode enum.
  */
-inline CompatMode parse_compat_mode_str(std::string_view compat_mode_str)
-{
-  // Convert to lowercase
-  std::string tmp{compat_mode_str};
-  std::transform(
-    tmp.begin(), tmp.end(), tmp.begin(), [](unsigned char c) { return std::tolower(c); });
-
-  CompatMode res{};
-  if (tmp == "on" || tmp == "true" || tmp == "yes" || tmp == "1") {
-    res = CompatMode::ON;
-  } else if (tmp == "off" || tmp == "false" || tmp == "no" || tmp == "0") {
-    res = CompatMode::OFF;
-  } else if (tmp == "auto") {
-    res = CompatMode::AUTO;
-  } else {
-    throw std::invalid_argument("Unknown compatibility mode: " + std::string{tmp});
-  }
-  return res;
-}
+CompatMode parse_compat_mode_str(std::string_view compat_mode_str);
 
 template <typename T>
 T getenv_or(std::string_view env_var_name, T default_val)
@@ -92,44 +74,10 @@ T getenv_or(std::string_view env_var_name, T default_val)
 }
 
 template <>
-inline bool getenv_or(std::string_view env_var_name, bool default_val)
-{
-  const auto* env_val = std::getenv(env_var_name.data());
-  if (env_val == nullptr) { return default_val; }
-  try {
-    // Try parsing `env_var_name` as a integer
-    return static_cast<bool>(std::stoi(env_val));
-  } catch (const std::invalid_argument&) {
-  }
-  // Convert to lowercase
-  std::string str{env_val};
-  // Special considerations regarding the case conversion:
-  // - std::tolower() is not an addressable function. Passing it to std::transform() as
-  //   a function pointer, if the compile turns out successful, causes the program behavior
-  //   "unspecified (possibly ill-formed)", hence the lambda. ::tolower() is addressable
-  //   and does not have this problem, but the following item still applies.
-  // - To avoid UB in std::tolower() or ::tolower(), the character must be cast to unsigned char.
-  std::transform(
-    str.begin(), str.end(), str.begin(), [](unsigned char c) { return std::tolower(c); });
-  // Trim whitespaces
-  std::stringstream trimmer;
-  trimmer << str;
-  str.clear();
-  trimmer >> str;
-  // Match value
-  if (str == "true" || str == "on" || str == "yes") { return true; }
-  if (str == "false" || str == "off" || str == "no") { return false; }
-  throw std::invalid_argument("unknown config value " + std::string{env_var_name} + "=" +
-                              std::string{env_val});
-}
+bool getenv_or(std::string_view env_var_name, bool default_val);
 
 template <>
-inline CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val)
-{
-  auto* env_val = std::getenv(env_var_name.data());
-  if (env_val == nullptr) { return default_val; }
-  return parse_compat_mode_str(env_val);
-}
+CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val);
 
 }  // namespace detail
 
@@ -145,54 +93,11 @@ class defaults {
   std::size_t _gds_threshold;
   std::size_t _bounce_buffer_size;
 
-  static unsigned int get_num_threads_from_env()
-  {
-    const int ret = detail::getenv_or("KVIKIO_NTHREADS", 1);
-    if (ret <= 0) {
-      throw std::invalid_argument("KVIKIO_NTHREADS has to be a positive integer greater than zero");
-    }
-    return ret;
-  }
+  static unsigned int get_num_threads_from_env();
 
-  defaults()
-  {
-    // Determine the default value of `compat_mode`
-    {
-      _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO);
-    }
-    // Determine the default value of `task_size`
-    {
-      const ssize_t env = detail::getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024);
-      if (env <= 0) {
-        throw std::invalid_argument(
-          "KVIKIO_TASK_SIZE has to be a positive integer greater than zero");
-      }
-      _task_size = env;
-    }
-    // Determine the default value of `gds_threshold`
-    {
-      const ssize_t env = detail::getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024);
-      if (env < 0) {
-        throw std::invalid_argument("KVIKIO_GDS_THRESHOLD has to be a positive integer");
-      }
-      _gds_threshold = env;
-    }
-    // Determine the default value of `bounce_buffer_size`
-    {
-      const ssize_t env = detail::getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024);
-      if (env <= 0) {
-        throw std::invalid_argument(
-          "KVIKIO_BOUNCE_BUFFER_SIZE has to be a positive integer greater than zero");
-      }
-      _bounce_buffer_size = env;
-    }
-  }
+  defaults();
 
-  KVIKIO_EXPORT static defaults* instance()
-  {
-    static defaults _instance;
-    return &_instance;
-  }
+  KVIKIO_EXPORT static defaults* instance();
 
  public:
   /**
@@ -213,7 +118,7 @@ class defaults {
    *
    * @return Compatibility mode.
    */
-  [[nodiscard]] static CompatMode compat_mode() { return instance()->_compat_mode; }
+  [[nodiscard]] static CompatMode compat_mode();
 
   /**
    * @brief Reset the value of `kvikio::defaults::compat_mode()`.
@@ -223,7 +128,7 @@ class defaults {
    *
    * @param compat_mode Compatibility mode.
    */
-  static void compat_mode_reset(CompatMode compat_mode) { instance()->_compat_mode = compat_mode; }
+  static void compat_mode_reset(CompatMode compat_mode);
 
   /**
    * @brief Infer the `AUTO` compatibility mode from the system runtime.
@@ -234,16 +139,7 @@ class defaults {
    * (`ON`/`OFF`/`AUTO`) to two (`ON`/`OFF`) so as to determine the actual I/O path. This function
    * is lightweight as the inferred result is cached.
    */
-  static CompatMode infer_compat_mode_if_auto(CompatMode compat_mode)
-  {
-    if (compat_mode == CompatMode::AUTO) {
-      static auto inferred_compat_mode_for_auto = []() -> CompatMode {
-        return is_cufile_available() ? CompatMode::OFF : CompatMode::ON;
-      }();
-      return inferred_compat_mode_for_auto;
-    }
-    return compat_mode;
-  }
+  static CompatMode infer_compat_mode_if_auto(CompatMode compat_mode);
 
   /**
    * @brief Given a requested compatibility mode, whether it is expected to reduce to `ON`.
@@ -260,12 +156,7 @@ class defaults {
    * @param compat_mode Compatibility mode.
    * @return Boolean answer.
    */
-  static bool is_compat_mode_preferred(CompatMode compat_mode)
-  {
-    return compat_mode == CompatMode::ON ||
-           (compat_mode == CompatMode::AUTO &&
-            defaults::infer_compat_mode_if_auto(compat_mode) == CompatMode::ON);
-  }
+  static bool is_compat_mode_preferred(CompatMode compat_mode);
 
   /**
    * @brief Whether the global compatibility mode from class defaults is expected to be `ON`.
@@ -281,7 +172,7 @@ class defaults {
    *
    * @return Boolean answer.
    */
-  static bool is_compat_mode_preferred() { return is_compat_mode_preferred(compat_mode()); }
+  static bool is_compat_mode_preferred();
 
   /**
    * @brief Get the default thread pool.
@@ -292,7 +183,7 @@ class defaults {
    *
    * @return The the default thread pool instance.
    */
-  [[nodiscard]] static BS::thread_pool& thread_pool() { return instance()->_thread_pool; }
+  [[nodiscard]] static BS::thread_pool& thread_pool();
 
   /**
    * @brief Get the number of threads in the default thread pool.
@@ -302,10 +193,7 @@ class defaults {
    *
    * @return The number of threads.
    */
-  [[nodiscard]] static unsigned int thread_pool_nthreads()
-  {
-    return thread_pool().get_thread_count();
-  }
+  [[nodiscard]] static unsigned int thread_pool_nthreads();
 
   /**
    * @brief Reset the number of threads in the default thread pool. Waits for all currently running
@@ -316,13 +204,7 @@ class defaults {
    *
    * @param nthreads The number of threads to use.
    */
-  static void thread_pool_nthreads_reset(unsigned int nthreads)
-  {
-    if (nthreads == 0) {
-      throw std::invalid_argument("number of threads must be a positive integer greater than zero");
-    }
-    thread_pool().reset(nthreads);
-  }
+  static void thread_pool_nthreads_reset(unsigned int nthreads);
 
   /**
    * @brief Get the default task size used for parallel IO operations.
@@ -332,20 +214,14 @@ class defaults {
    *
    * @return The default task size in bytes.
    */
-  [[nodiscard]] static std::size_t task_size() { return instance()->_task_size; }
+  [[nodiscard]] static std::size_t task_size();
 
   /**
    * @brief Reset the default task size used for parallel IO operations.
    *
    * @param nbytes The default task size in bytes.
    */
-  static void task_size_reset(std::size_t nbytes)
-  {
-    if (nbytes == 0) {
-      throw std::invalid_argument("task size must be a positive integer greater than zero");
-    }
-    instance()->_task_size = nbytes;
-  }
+  static void task_size_reset(std::size_t nbytes);
 
   /**
    * @brief Get the default GDS threshold, which is the minimum size to use GDS (in bytes).
@@ -358,13 +234,13 @@ class defaults {
    *
    * @return The default GDS threshold size in bytes.
    */
-  [[nodiscard]] static std::size_t gds_threshold() { return instance()->_gds_threshold; }
+  [[nodiscard]] static std::size_t gds_threshold();
 
   /**
    * @brief Reset the default GDS threshold, which is the minimum size to use GDS (in bytes).
    * @param nbytes The default GDS threshold size in bytes.
    */
-  static void gds_threshold_reset(std::size_t nbytes) { instance()->_gds_threshold = nbytes; }
+  static void gds_threshold_reset(std::size_t nbytes);
 
   /**
    * @brief Get the size of the bounce buffer used to stage data in host memory.
@@ -374,21 +250,14 @@ class defaults {
    *
    * @return The bounce buffer size in bytes.
    */
-  [[nodiscard]] static std::size_t bounce_buffer_size() { return instance()->_bounce_buffer_size; }
+  [[nodiscard]] static std::size_t bounce_buffer_size();
 
   /**
    * @brief Reset the size of the bounce buffer used to stage data in host memory.
    *
    * @param nbytes The bounce buffer size in bytes.
    */
-  static void bounce_buffer_size_reset(std::size_t nbytes)
-  {
-    if (nbytes == 0) {
-      throw std::invalid_argument(
-        "size of the bounce buffer must be a positive integer greater than zero");
-    }
-    instance()->_bounce_buffer_size = nbytes;
-  }
+  static void bounce_buffer_size_reset(std::size_t nbytes);
 };
 
 }  // namespace kvikio
diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/posix_io.hpp
index 4327a301ec..7675285d09 100644
--- a/cpp/include/kvikio/posix_io.hpp
+++ b/cpp/include/kvikio/posix_io.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -65,31 +65,9 @@ class StreamsByThread {
   // cuDevicePrimaryCtxReset() or cudaDeviceReset() before program termination.
   ~StreamsByThread() = default;
 
-  KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id)
-  {
-    static StreamsByThread _instance;
+  KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id);
 
-    // If no current context, we return the null/default stream
-    if (ctx == nullptr) { return nullptr; }
-    auto key = std::make_pair(ctx, thd_id);
-
-    // Create a new stream if `ctx` doesn't have one.
-    if (auto search = _instance._streams.find(key); search == _instance._streams.end()) {
-      CUstream stream{};
-      CUDA_DRIVER_TRY(cudaAPI::instance().StreamCreate(&stream, CU_STREAM_DEFAULT));
-      _instance._streams[key] = stream;
-      return stream;
-    } else {
-      return search->second;
-    }
-  }
-
-  static CUstream get()
-  {
-    CUcontext ctx{nullptr};
-    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
-    return get(ctx, std::this_thread::get_id());
-  }
+  static CUstream get();
 
   StreamsByThread(const StreamsByThread&)            = delete;
   StreamsByThread& operator=(StreamsByThread const&) = delete;
@@ -251,16 +229,11 @@ std::size_t posix_host_write(int fd, const void* buf, std::size_t size, std::siz
  * @param devPtr_offset Offset relative to the `devPtr_base` pointer to read into.
  * @return Size of bytes that were successfully read.
  */
-inline std::size_t posix_device_read(int fd,
-                                     const void* devPtr_base,
-                                     std::size_t size,
-                                     std::size_t file_offset,
-                                     std::size_t devPtr_offset)
-{
-  KVIKIO_NVTX_SCOPED_RANGE("posix_device_read()", size);
-  return detail::posix_device_io<IOOperationType::READ>(
-    fd, devPtr_base, size, file_offset, devPtr_offset);
-}
+std::size_t posix_device_read(int fd,
+                              const void* devPtr_base,
+                              std::size_t size,
+                              std::size_t file_offset,
+                              std::size_t devPtr_offset);
 
 /**
  * @brief Write device memory to disk using POSIX
@@ -275,15 +248,10 @@ inline std::size_t posix_device_read(int fd,
  * @param devPtr_offset Offset relative to the `devPtr_base` pointer to write into.
  * @return Size of bytes that were successfully written.
  */
-inline std::size_t posix_device_write(int fd,
-                                      const void* devPtr_base,
-                                      std::size_t size,
-                                      std::size_t file_offset,
-                                      std::size_t devPtr_offset)
-{
-  KVIKIO_NVTX_SCOPED_RANGE("posix_device_write()", size);
-  return detail::posix_device_io<IOOperationType::WRITE>(
-    fd, devPtr_base, size, file_offset, devPtr_offset);
-}
+std::size_t posix_device_write(int fd,
+                               const void* devPtr_base,
+                               std::size_t size,
+                               std::size_t file_offset,
+                               std::size_t devPtr_offset);
 
 }  // namespace kvikio::detail
diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp
index e1b152b23c..ff0741ae3f 100644
--- a/cpp/include/kvikio/remote_handle.hpp
+++ b/cpp/include/kvikio/remote_handle.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,101 +33,6 @@
 #include <kvikio/utils.hpp>
 
 namespace kvikio {
-namespace detail {
-
-/**
- * @brief Bounce buffer in pinned host memory.
- *
- * @note Is not thread-safe.
- */
-class BounceBufferH2D {
-  CUstream _stream;                 // The CUDA stream to use.
-  CUdeviceptr _dev;                 // The output device buffer.
-  AllocRetain::Alloc _host_buffer;  // The host buffer to bounce data on.
-  std::ptrdiff_t _dev_offset{0};    // Number of bytes written to `_dev`.
-  std::ptrdiff_t _host_offset{0};   // Number of bytes written to `_host` (resets on flush).
-
- public:
-  /**
-   * @brief Create a bounce buffer for an output device buffer.
-   *
-   * @param stream The CUDA stream used throughout the lifetime of the bounce buffer.
-   * @param device_buffer The output device buffer (final destination of the data).
-   */
-  BounceBufferH2D(CUstream stream, void* device_buffer)
-    : _stream{stream},
-      _dev{convert_void2deviceptr(device_buffer)},
-      _host_buffer{AllocRetain::instance().get()}
-  {
-  }
-
-  /**
-   * @brief The bounce buffer if flushed to device on destruction.
-   */
-  ~BounceBufferH2D() noexcept
-  {
-    try {
-      flush();
-    } catch (CUfileException const& e) {
-      std::cerr << "BounceBufferH2D error on final flush: ";
-      std::cerr << e.what();
-      std::cerr << std::endl;
-    }
-  }
-
- private:
-  /**
-   * @brief Write host memory to the output device buffer.
-   *
-   * @param src The host memory source.
-   * @param size Number of bytes to write.
-   */
-  void write_to_device(void const* src, std::size_t size)
-  {
-    if (size > 0) {
-      CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream));
-      CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream));
-      _dev_offset += size;
-    }
-  }
-
-  /**
-   * @brief Flush the bounce buffer by writing everything to the output device buffer.
-   */
-  void flush()
-  {
-    write_to_device(_host_buffer.get(), _host_offset);
-    _host_offset = 0;
-  }
-
- public:
-  /**
-   * @brief Write host memory to the bounce buffer (also host memory).
-   *
-   * Only when the bounce buffer has been filled up is data copied to the output device buffer.
-   *
-   * @param data The host memory source.
-   * @param size Number of bytes to write.
-   */
-  void write(char const* data, std::size_t size)
-  {
-    if (_host_buffer.size() - _host_offset < size) {  // Not enough space left in the bounce buffer
-      flush();
-      assert(_host_offset == 0);
-    }
-    if (_host_buffer.size() < size) {
-      // If still not enough space, we just copy the data to the device. This only happens when
-      // `defaults::bounce_buffer_size()` is smaller than 16kb thus no need to performance
-      // optimize for this case.
-      write_to_device(data, size);
-    } else if (size > 0) {
-      std::memcpy(_host_buffer.get(_host_offset), data, size);
-      _host_offset += size;
-    }
-  }
-};
-
-}  // namespace detail
 
 class CurlHandle;  // Prototype
 
@@ -173,9 +78,9 @@ class HttpEndpoint : public RemoteEndpoint {
    *
    * @param url The full http url to the remote file.
    */
-  HttpEndpoint(std::string url) : _url{std::move(url)} {}
+  HttpEndpoint(std::string url);
   void setopt(CurlHandle& curl) override;
-  std::string str() const override { return _url; }
+  std::string str() const override;
   ~HttpEndpoint() override = default;
 };
 
@@ -203,17 +108,7 @@ class S3Endpoint : public RemoteEndpoint {
    */
   static std::string unwrap_or_default(std::optional<std::string> aws_arg,
                                        std::string const& env_var,
-                                       std::string const& err_msg = "")
-  {
-    if (aws_arg.has_value()) { return std::move(*aws_arg); }
-
-    char const* env = std::getenv(env_var.c_str());
-    if (env == nullptr) {
-      if (err_msg.empty()) { return std::string(); }
-      throw std::invalid_argument(err_msg);
-    }
-    return std::string(env);
-  }
+                                       std::string const& err_msg = "");
 
  public:
   /**
@@ -234,22 +129,7 @@ class S3Endpoint : public RemoteEndpoint {
   static std::string url_from_bucket_and_object(std::string const& bucket_name,
                                                 std::string const& object_name,
                                                 std::optional<std::string> const& aws_region,
-                                                std::optional<std::string> aws_endpoint_url)
-  {
-    auto const endpoint_url = unwrap_or_default(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL");
-    std::stringstream ss;
-    if (endpoint_url.empty()) {
-      auto const region =
-        unwrap_or_default(std::move(aws_region),
-                          "AWS_DEFAULT_REGION",
-                          "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set.");
-      // We default to the official AWS url scheme.
-      ss << "https://" << bucket_name << ".s3." << region << ".amazonaws.com/" << object_name;
-    } else {
-      ss << endpoint_url << "/" << bucket_name << "/" << object_name;
-    }
-    return ss.str();
-  }
+                                                std::optional<std::string> aws_endpoint_url);
 
   /**
    * @brief Given an url like "s3://<bucket>/<object>", return the name of the bucket and object.
@@ -259,14 +139,7 @@ class S3Endpoint : public RemoteEndpoint {
    * @param s3_url S3 url.
    * @return Pair of strings: [bucket-name, object-name].
    */
-  [[nodiscard]] static std::pair<std::string, std::string> parse_s3_url(std::string const& s3_url)
-  {
-    // Regular expression to match s3://<bucket>/<object>
-    std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase};
-    std::smatch matches;
-    if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; }
-    throw std::invalid_argument("Input string does not match the expected S3 URL format.");
-  }
+  [[nodiscard]] static std::pair<std::string, std::string> parse_s3_url(std::string const& s3_url);
 
   /**
    * @brief Create a S3 endpoint from a url.
@@ -284,46 +157,7 @@ class S3Endpoint : public RemoteEndpoint {
   S3Endpoint(std::string url,
              std::optional<std::string> aws_region            = std::nullopt,
              std::optional<std::string> aws_access_key        = std::nullopt,
-             std::optional<std::string> aws_secret_access_key = std::nullopt)
-    : _url{std::move(url)}
-  {
-    // Regular expression to match http[s]://
-    std::regex pattern{R"(^https?://.*)", std::regex_constants::icase};
-    if (!std::regex_search(_url, pattern)) {
-      throw std::invalid_argument("url must start with http:// or https://");
-    }
-
-    auto const region =
-      unwrap_or_default(std::move(aws_region),
-                        "AWS_DEFAULT_REGION",
-                        "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set.");
-
-    auto const access_key =
-      unwrap_or_default(std::move(aws_access_key),
-                        "AWS_ACCESS_KEY_ID",
-                        "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set.");
-
-    auto const secret_access_key = unwrap_or_default(
-      std::move(aws_secret_access_key),
-      "AWS_SECRET_ACCESS_KEY",
-      "S3: must provide `aws_secret_access_key` if AWS_SECRET_ACCESS_KEY isn't set.");
-
-    // Create the CURLOPT_AWS_SIGV4 option
-    {
-      std::stringstream ss;
-      ss << "aws:amz:" << region << ":s3";
-      _aws_sigv4 = ss.str();
-    }
-    // Create the CURLOPT_USERPWD option
-    // Notice, curl uses `secret_access_key` to generate a AWS V4 signature. It is NOT included
-    // in the http header. See
-    // <https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_sigv-create-signed-request.html>
-    {
-      std::stringstream ss;
-      ss << access_key << ":" << secret_access_key;
-      _aws_userpwd = ss.str();
-    }
-  }
+             std::optional<std::string> aws_secret_access_key = std::nullopt);
 
   /**
    * @brief Create a S3 endpoint from a bucket and object name.
@@ -346,17 +180,10 @@ class S3Endpoint : public RemoteEndpoint {
              std::optional<std::string> aws_region            = std::nullopt,
              std::optional<std::string> aws_access_key        = std::nullopt,
              std::optional<std::string> aws_secret_access_key = std::nullopt,
-             std::optional<std::string> aws_endpoint_url      = std::nullopt)
-    : S3Endpoint(url_from_bucket_and_object(
-                   bucket_name, object_name, aws_region, std::move(aws_endpoint_url)),
-                 std::move(aws_region),
-                 std::move(aws_access_key),
-                 std::move(aws_secret_access_key))
-  {
-  }
+             std::optional<std::string> aws_endpoint_url      = std::nullopt);
 
   void setopt(CurlHandle& curl) override;
-  std::string str() const override { return _url; }
+  std::string str() const override;
   ~S3Endpoint() override = default;
 };
 
@@ -375,10 +202,7 @@ class RemoteHandle {
    * @param endpoint Remote endpoint used for subsequent IO.
    * @param nbytes The size of the remote file (in bytes).
    */
-  RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes)
-    : _endpoint{std::move(endpoint)}, _nbytes{nbytes}
-  {
-  }
+  RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes);
 
   /**
    * @brief Create a new remote handle from an endpoint (infers the file size).
@@ -402,14 +226,14 @@ class RemoteHandle {
    *
    * @return The number of bytes.
    */
-  [[nodiscard]] std::size_t nbytes() const noexcept { return _nbytes; }
+  [[nodiscard]] std::size_t nbytes() const noexcept;
 
   /**
    * @brief Get a const reference to the underlying remote endpoint.
    *
    * @return The remote endpoint.
    */
-  [[nodiscard]] RemoteEndpoint const& endpoint() const noexcept { return *_endpoint; }
+  [[nodiscard]] RemoteEndpoint const& endpoint() const noexcept;
 
   /**
    * @brief Read from remote source into buffer (host or device memory).
diff --git a/cpp/include/kvikio/stream.hpp b/cpp/include/kvikio/stream.hpp
index 9eb9942b7a..4b3e37a980 100644
--- a/cpp/include/kvikio/stream.hpp
+++ b/cpp/include/kvikio/stream.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,13 +17,11 @@
 
 #include <sys/types.h>
 #include <cstdlib>
-#include <iostream>
-#include <kvikio/error.hpp>
-#include <kvikio/shim/cuda.hpp>
-#include <kvikio/shim/cufile.hpp>
 #include <tuple>
 #include <utility>
 
+#include <kvikio/shim/cuda.hpp>
+
 namespace kvikio {
 
 /**
@@ -63,38 +61,15 @@ class StreamFuture {
   StreamFuture() noexcept = default;
 
   StreamFuture(
-    void* devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream)
-    : _devPtr_base{devPtr_base}, _stream{stream}
-  {
-    // Notice, we allocate the arguments using malloc() as specified in the cuFile docs:
-    // <https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html#cufilewriteasync>
-    if ((_val = static_cast<ArgByVal*>(std::malloc(sizeof(ArgByVal)))) == nullptr) {
-      throw std::bad_alloc{};
-    }
-    *_val = {
-      .size = size, .file_offset = file_offset, .devPtr_offset = devPtr_offset, .bytes_done = 0};
-  }
+    void* devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream);
 
   /**
    * @brief StreamFuture support move semantic but isn't copyable
    */
   StreamFuture(const StreamFuture&)        = delete;
   StreamFuture& operator=(StreamFuture& o) = delete;
-  StreamFuture(StreamFuture&& o) noexcept
-    : _devPtr_base{std::exchange(o._devPtr_base, nullptr)},
-      _stream{std::exchange(o._stream, nullptr)},
-      _val{std::exchange(o._val, nullptr)},
-      _stream_synchronized{o._stream_synchronized}
-  {
-  }
-  StreamFuture& operator=(StreamFuture&& o) noexcept
-  {
-    _devPtr_base         = std::exchange(o._devPtr_base, nullptr);
-    _stream              = std::exchange(o._stream, nullptr);
-    _val                 = std::exchange(o._val, nullptr);
-    _stream_synchronized = o._stream_synchronized;
-    return *this;
-  }
+  StreamFuture(StreamFuture&& o) noexcept;
+  StreamFuture& operator=(StreamFuture&& o) noexcept;
 
   /**
    * @brief Return the arguments of the future call
@@ -102,18 +77,7 @@ class StreamFuture {
    * @return Tuple of the arguments in the order matching `FileHandle.read()` and
    * `FileHandle.write()`
    */
-  std::tuple<void*, std::size_t*, off_t*, off_t*, ssize_t*, CUstream> get_args() const
-  {
-    if (_val == nullptr) {
-      throw kvikio::CUfileException("cannot get arguments from an uninitialized StreamFuture");
-    }
-    return {_devPtr_base,
-            &_val->size,
-            &_val->file_offset,
-            &_val->devPtr_offset,
-            &_val->bytes_done,
-            _stream};
-  }
+  std::tuple<void*, std::size_t*, off_t*, off_t*, ssize_t*, CUstream> get_args() const;
 
   /**
    * @brief Return the number of bytes read or written by the future operation.
@@ -122,38 +86,13 @@ class StreamFuture {
    *
    * @return Number of bytes read or written by the future operation.
    */
-  std::size_t check_bytes_done()
-  {
-    if (_val == nullptr) {
-      throw kvikio::CUfileException("cannot check bytes done on an uninitialized StreamFuture");
-    }
-
-    if (!_stream_synchronized) {
-      _stream_synchronized = true;
-      CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream));
-    }
-
-    CUFILE_CHECK_BYTES_DONE(_val->bytes_done);
-    // At this point, we know `_val->bytes_done` is a positive value otherwise
-    // CUFILE_CHECK_BYTES_DONE() would have raised an exception.
-    return static_cast<std::size_t>(_val->bytes_done);
-  }
+  std::size_t check_bytes_done();
 
   /**
    * @brief Free the by-value arguments and make sure the associated CUDA stream has been
    * synchronized.
    */
-  ~StreamFuture() noexcept
-  {
-    if (_val != nullptr) {
-      try {
-        check_bytes_done();
-      } catch (const kvikio::CUfileException& e) {
-        std::cerr << e.what() << std::endl;
-      }
-      std::free(_val);
-    }
-  }
+  ~StreamFuture() noexcept;
 };
 
 }  // namespace kvikio
diff --git a/cpp/src/batch.cpp b/cpp/src/batch.cpp
new file mode 100644
index 0000000000..8ced70cbd8
--- /dev/null
+++ b/cpp/src/batch.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <ctime>
+#include <utility>
+#include <vector>
+
+#include <kvikio/batch.hpp>
+#include <kvikio/error.hpp>
+#include <kvikio/file_handle.hpp>
+#include <kvikio/shim/cufile.hpp>
+
+namespace kvikio {
+
+#ifdef KVIKIO_CUFILE_BATCH_API_FOUND
+
+BatchHandle::BatchHandle(int max_num_events) : _initialized{true}, _max_num_events{max_num_events}
+{
+  CUFILE_TRY(cuFileAPI::instance().BatchIOSetUp(&_handle, max_num_events));
+}
+
+BatchHandle::BatchHandle(BatchHandle&& o) noexcept
+  : _initialized{std::exchange(o._initialized, false)},
+    _max_num_events{std::exchange(o._max_num_events, 0)}
+{
+  _handle = std::exchange(o._handle, CUfileBatchHandle_t{});
+}
+
+BatchHandle::~BatchHandle() noexcept { close(); }
+
+bool BatchHandle::closed() const noexcept { return !_initialized; }
+
+void BatchHandle::close() noexcept
+{
+  // Idempotent: the flag is cleared before destroying, so a repeated call returns early.
+  if (!_initialized) { return; }
+  _initialized = false;
+  cuFileAPI::instance().BatchIODestroy(_handle);
+}
+
+void BatchHandle::submit(const std::vector<BatchOp>& operations)
+{
+  if (convert_size2ssize(operations.size()) > _max_num_events) {
+    throw CUfileException("Cannot submit more than the max_num_events");
+  }
+  std::vector<CUfileIOParams_t> io_batch_params;
+  io_batch_params.reserve(operations.size());
+  for (const auto& op : operations) {
+    if (op.file_handle.is_compat_mode_preferred()) {
+      throw CUfileException("Cannot submit a FileHandle opened in compatibility mode");
+    }
+    // Translate the operation into the parameter struct consumed by the cuFile batch API.
+    io_batch_params.push_back(CUfileIOParams_t{.mode   = CUFILE_BATCH,
+                                               .u      = {.batch = {.devPtr_base   = op.devPtr_base,
+                                                                    .file_offset   = op.file_offset,
+                                                                    .devPtr_offset = op.devPtr_offset,
+                                                                    .size          = op.size}},
+                                               .fh     = op.file_handle.handle(),
+                                               .opcode = op.opcode,
+                                               .cookie = nullptr});
+  }
+  // Submit all prepared operations to cuFile in one call; a failure is raised by CUFILE_TRY.
+  CUFILE_TRY(cuFileAPI::instance().BatchIOSubmit(
+    _handle, io_batch_params.size(), io_batch_params.data(), 0));
+}
+
+std::vector<CUfileIOEvents_t> BatchHandle::status(unsigned min_nr,
+                                                  unsigned max_nr,
+                                                  struct timespec* timeout)
+{
+  // `max_nr` is passed by pointer and its updated value sizes the returned vector.
+  std::vector<CUfileIOEvents_t> ret(_max_num_events);
+  CUFILE_TRY(cuFileAPI::instance().BatchIOGetStatus(_handle, min_nr, &max_nr, ret.data(), timeout));
+  ret.resize(max_nr);
+  return ret;
+}
+
+void BatchHandle::cancel() { CUFILE_TRY(cuFileAPI::instance().BatchIOCancel(_handle)); }
+
+#else
+
+BatchHandle::BatchHandle(int max_num_events)
+{
+  throw CUfileException("BatchHandle requires cuFile's batch API, please build with CUDA v12.1+");
+}
+
+bool BatchHandle::closed() const noexcept { return true; }
+
+void BatchHandle::close() noexcept {}
+
+void BatchHandle::submit(const std::vector<BatchOp>& operations) {}
+
+std::vector<CUfileIOEvents_t> BatchHandle::status(unsigned min_nr,
+                                                  unsigned max_nr,
+                                                  struct timespec* timeout)
+{
+  return std::vector<CUfileIOEvents_t>{};
+}
+
+void BatchHandle::cancel() {}
+
+#endif
+
+}  // namespace kvikio
diff --git a/cpp/src/bounce_buffer.cpp b/cpp/src/bounce_buffer.cpp
new file mode 100644
index 0000000000..65ca1aaa52
--- /dev/null
+++ b/cpp/src/bounce_buffer.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <mutex>
+#include <stack>
+
+#include <kvikio/bounce_buffer.hpp>
+#include <kvikio/defaults.hpp>
+#include <kvikio/error.hpp>
+#include <kvikio/shim/cuda.hpp>
+
+namespace kvikio {
+
+AllocRetain::Alloc::Alloc(AllocRetain* manager, void* alloc, std::size_t size)
+  : _manager(manager), _alloc{alloc}, _size{size}
+{
+}
+
+AllocRetain::Alloc::~Alloc() noexcept { _manager->put(_alloc, _size); }
+
+void* AllocRetain::Alloc::get() noexcept { return _alloc; }
+
+void* AllocRetain::Alloc::get(std::ptrdiff_t offset) noexcept
+{
+  return static_cast<char*>(_alloc) + offset;
+}
+
+std::size_t AllocRetain::Alloc::size() noexcept { return _size; }
+
+std::size_t AllocRetain::_clear()
+{
+  std::size_t ret = _free_allocs.size() * _size;
+  while (!_free_allocs.empty()) {
+    CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(_free_allocs.top()));
+    _free_allocs.pop();
+  }
+  return ret;
+}
+
+void AllocRetain::_ensure_alloc_size()
+{
+  auto const bounce_buffer_size = defaults::bounce_buffer_size();
+  if (_size != bounce_buffer_size) {
+    _clear();
+    _size = bounce_buffer_size;
+  }
+}
+
+AllocRetain::Alloc AllocRetain::get()
+{
+  std::lock_guard const lock(_mutex);
+  _ensure_alloc_size();
+
+  // Check if we have an allocation available
+  if (!_free_allocs.empty()) {
+    void* ret = _free_allocs.top();
+    _free_allocs.pop();
+    return Alloc(this, ret, _size);
+  }
+
+  // If no available allocation, allocate and register a new one
+  void* alloc{};
+  // Allocate page-locked host memory
+  CUDA_DRIVER_TRY(cudaAPI::instance().MemHostAlloc(&alloc, _size, CU_MEMHOSTREGISTER_PORTABLE));
+  return Alloc(this, alloc, _size);
+}
+
+void AllocRetain::put(void* alloc, std::size_t size)
+{
+  std::lock_guard const lock(_mutex);
+  _ensure_alloc_size();
+
+  // If the size of `alloc` matches the size of the retained allocations, it is
+  // added to the set of free allocations; otherwise it is freed right away.
+  if (size == _size) {
+    _free_allocs.push(alloc);
+  } else {
+    CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(alloc));
+  }
+}
+
+std::size_t AllocRetain::clear()
+{
+  std::lock_guard const lock(_mutex);
+  return _clear();
+}
+
+AllocRetain& AllocRetain::instance()
+{
+  static AllocRetain _instance;
+  return _instance;
+}
+
+}  // namespace kvikio
diff --git a/cpp/src/buffer.cpp b/cpp/src/buffer.cpp
new file mode 100644
index 0000000000..0aa772d50f
--- /dev/null
+++ b/cpp/src/buffer.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <kvikio/buffer.hpp>
+#include <kvikio/defaults.hpp>
+#include <kvikio/error.hpp>
+#include <kvikio/shim/cufile.hpp>
+#include <kvikio/shim/cufile_h_wrapper.hpp>
+#include <kvikio/utils.hpp>
+
+namespace kvikio {
+
+void buffer_register(const void* devPtr_base,
+                     std::size_t size,
+                     int flags,
+                     const std::vector<int>& errors_to_ignore)
+{
+  // Registration is skipped entirely when compatibility mode is preferred.
+  if (defaults::is_compat_mode_preferred()) { return; }
+  CUfileError_t result = cuFileAPI::instance().BufRegister(devPtr_base, size, flags);
+  if (result.err == CU_FILE_SUCCESS) { return; }
+  // Raise only when the error code is not one the caller asked to ignore.
+  bool const ignored =
+    std::find(errors_to_ignore.begin(), errors_to_ignore.end(), result.err) !=
+    errors_to_ignore.end();
+  if (!ignored) { CUFILE_TRY(result); }
+}
+
+void buffer_deregister(const void* devPtr_base)
+{
+  if (defaults::is_compat_mode_preferred()) { return; }
+  CUFILE_TRY(cuFileAPI::instance().BufDeregister(devPtr_base));
+}
+
+void memory_register(const void* devPtr, int flags, const std::vector<int>& errors_to_ignore)
+{
+  auto [base, nbytes, offset] = get_alloc_info(devPtr);
+  buffer_register(base, nbytes, flags, errors_to_ignore);
+}
+
+void memory_deregister(const void* devPtr)
+{
+  auto [base, nbytes, offset] = get_alloc_info(devPtr);
+  buffer_deregister(base);
+}
+
+}  // namespace kvikio
diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp
index 7566c11532..2abbf33e92 100644
--- a/cpp/src/cufile/config.cpp
+++ b/cpp/src/cufile/config.cpp
@@ -21,7 +21,7 @@
 #include <kvikio/cufile/config.hpp>
 
 namespace kvikio {
-namespace detail {
+namespace {
 
 [[nodiscard]] inline const char* lookup_config_path()
 {
@@ -31,11 +31,11 @@ namespace detail {
   return "";
 }
 
-}  // namespace detail
+}  // namespace
 
 const std::string& config_path()
 {
-  static const std::string ret = detail::lookup_config_path();
+  static const std::string ret = lookup_config_path();
   return ret;
 }
 
diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp
index 127050ed06..13a23f547c 100644
--- a/cpp/src/cufile/driver.cpp
+++ b/cpp/src/cufile/driver.cpp
@@ -23,7 +23,7 @@
 #include <kvikio/shim/cufile_h_wrapper.hpp>
 
 namespace kvikio {
-namespace detail {
+namespace {
 
 [[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept
 {
@@ -38,7 +38,7 @@ inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noe
     prop &= ~(1U << flag);
   }
 }
-}  // namespace detail
+}  // namespace
 
 #ifdef KVIKIO_CUFILE_FOUND
 
@@ -70,31 +70,31 @@ bool DriverProperties::is_gds_available()
   return !(get_nvfs_major_version() == 0 && get_nvfs_minor_version() == 0);
 }
 
-[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version()
+unsigned int DriverProperties::get_nvfs_major_version()
 {
   lazy_init();
   return _props.nvfs.major_version;
 }
 
-[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version()
+unsigned int DriverProperties::get_nvfs_minor_version()
 {
   lazy_init();
   return _props.nvfs.minor_version;
 }
 
-[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode()
+bool DriverProperties::get_nvfs_allow_compat_mode()
 {
   lazy_init();
-  return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE);
+  return get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE);
 }
 
-[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode()
+bool DriverProperties::get_nvfs_poll_mode()
 {
   lazy_init();
-  return detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE);
+  return get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE);
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size()
+std::size_t DriverProperties::get_nvfs_poll_thresh_size()
 {
   lazy_init();
   return _props.nvfs.poll_thresh_size;
@@ -104,7 +104,7 @@ void DriverProperties::set_nvfs_poll_mode(bool enable)
 {
   lazy_init();
   CUFILE_TRY(cuFileAPI::instance().DriverSetPollMode(enable, get_nvfs_poll_thresh_size()));
-  detail::set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable);
+  set_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE, enable);
 }
 
 void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb)
@@ -114,20 +114,20 @@ void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb)
   _props.nvfs.poll_thresh_size = size_in_kb;
 }
 
-[[nodiscard]] std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
+std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
 {
   lazy_init();
   std::vector<CUfileDriverControlFlags> ret;
-  if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) {
+  if (get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_USE_POLL_MODE)) {
     ret.push_back(CU_FILE_USE_POLL_MODE);
   }
-  if (detail::get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) {
+  if (get_driver_flag(_props.nvfs.dcontrolflags, CU_FILE_ALLOW_COMPAT_MODE)) {
     ret.push_back(CU_FILE_ALLOW_COMPAT_MODE);
   }
   return ret;
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size()
+std::size_t DriverProperties::get_max_device_cache_size()
 {
   lazy_init();
   return _props.max_device_cache_size;
@@ -140,13 +140,13 @@ void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb)
   _props.max_device_cache_size = size_in_kb;
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size()
+std::size_t DriverProperties::get_per_buffer_cache_size()
 {
   lazy_init();
   return _props.per_buffer_cache_size;
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size()
+std::size_t DriverProperties::get_max_pinned_memory_size()
 {
   lazy_init();
   return _props.max_device_pinned_mem_size;
@@ -159,7 +159,7 @@ void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb)
   _props.max_device_pinned_mem_size = size_in_kb;
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size()
+std::size_t DriverProperties::get_max_batch_io_size()
 {
 #ifdef KVIKIO_CUFILE_BATCH_API_FOUND
   lazy_init();
@@ -176,27 +176,27 @@ DriverProperties::DriverProperties() {}
 
 bool DriverProperties::is_gds_available() { return false; }
 
-[[nodiscard]] unsigned int DriverProperties::get_nvfs_major_version()
+unsigned int DriverProperties::get_nvfs_major_version()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] unsigned int DriverProperties::get_nvfs_minor_version()
+unsigned int DriverProperties::get_nvfs_minor_version()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] bool DriverProperties::get_nvfs_allow_compat_mode()
+bool DriverProperties::get_nvfs_allow_compat_mode()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] bool DriverProperties::get_nvfs_poll_mode()
+bool DriverProperties::get_nvfs_poll_mode()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_nvfs_poll_thresh_size()
+std::size_t DriverProperties::get_nvfs_poll_thresh_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
@@ -211,12 +211,12 @@ void DriverProperties::set_nvfs_poll_thresh_size(std::size_t size_in_kb)
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
+std::vector<CUfileDriverControlFlags> DriverProperties::get_nvfs_statusflags()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_max_device_cache_size()
+std::size_t DriverProperties::get_max_device_cache_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
@@ -226,12 +226,12 @@ void DriverProperties::set_max_device_cache_size(std::size_t size_in_kb)
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_per_buffer_cache_size()
+std::size_t DriverProperties::get_per_buffer_cache_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_max_pinned_memory_size()
+std::size_t DriverProperties::get_max_pinned_memory_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
@@ -241,7 +241,7 @@ void DriverProperties::set_max_pinned_memory_size(std::size_t size_in_kb)
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
 
-[[nodiscard]] std::size_t DriverProperties::get_max_batch_io_size()
+std::size_t DriverProperties::get_max_batch_io_size()
 {
   throw CUfileException("KvikIO not compiled with cuFile.h");
 }
diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp
new file mode 100644
index 0000000000..f249a8b361
--- /dev/null
+++ b/cpp/src/defaults.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include <BS_thread_pool.hpp>
+
+#include <kvikio/defaults.hpp>
+#include <kvikio/shim/cufile.hpp>
+
+namespace kvikio {
+
+namespace detail {
+CompatMode parse_compat_mode_str(std::string_view compat_mode_str)
+{
+  // Convert to lowercase
+  std::string tmp{compat_mode_str};
+  std::transform(
+    tmp.begin(), tmp.end(), tmp.begin(), [](unsigned char c) { return std::tolower(c); });
+
+  CompatMode res{};
+  if (tmp == "on" || tmp == "true" || tmp == "yes" || tmp == "1") {
+    res = CompatMode::ON;
+  } else if (tmp == "off" || tmp == "false" || tmp == "no" || tmp == "0") {
+    res = CompatMode::OFF;
+  } else if (tmp == "auto") {
+    res = CompatMode::AUTO;
+  } else {
+    throw std::invalid_argument("Unknown compatibility mode: " + std::string{tmp});
+  }
+  return res;
+}
+
+// Specialization for bool: reads `env_var_name`, returning `default_val` when it is unset. Throws
+// std::invalid_argument if the value is neither an `int` nor one of true/on/yes/false/off/no.
+template <>
+inline bool getenv_or(std::string_view env_var_name, bool default_val)
+{
+  // std::getenv() needs a null-terminated name, which std::string_view::data() does not promise.
+  const auto* env_val = std::getenv(std::string{env_var_name}.c_str());
+  if (env_val == nullptr) { return default_val; }
+  try {
+    // Try parsing the value as an integer; non-numeric or out-of-range text falls through below.
+    return static_cast<bool>(std::stoi(env_val));
+  } catch (const std::logic_error&) {
+  }
+  std::string str{env_val};
+  // Convert to lowercase. Special considerations regarding the case conversion:
+  // - std::tolower() is not an addressable function. Passing it to std::transform() as
+  //   a function pointer, if the compile turns out successful, causes the program behavior
+  //   "unspecified (possibly ill-formed)", hence the lambda. ::tolower() is addressable
+  //   and does not have this problem, but the following item still applies.
+  // - To avoid UB in std::tolower() or ::tolower(), the character must be cast to unsigned char.
+  std::transform(
+    str.begin(), str.end(), str.begin(), [](unsigned char c) { return std::tolower(c); });
+  std::stringstream trimmer;  // Trim whitespace: `>>` keeps only the first token.
+  trimmer << str;
+  str.clear();
+  trimmer >> str;
+  if (str == "true" || str == "on" || str == "yes") { return true; }
+  if (str == "false" || str == "off" || str == "no") { return false; }
+  throw std::invalid_argument("unknown config value " + std::string{env_var_name} + "=" +
+                              std::string{env_val});
+}
+
+template <>
+inline CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val)
+{
+  // Copy the name: std::getenv() requires null termination, which string_view does not promise.
+  const auto* env_val = std::getenv(std::string{env_var_name}.c_str());
+  return env_val == nullptr ? default_val : parse_compat_mode_str(env_val);
+}
+
+}  // namespace detail
+
+unsigned int defaults::get_num_threads_from_env()
+{
+  const int ret = detail::getenv_or("KVIKIO_NTHREADS", 1);
+  if (ret <= 0) {
+    throw std::invalid_argument("KVIKIO_NTHREADS has to be a positive integer greater than zero");
+  }
+  return ret;
+}
+
+defaults::defaults()
+{
+  // Determine the default value of `compat_mode`
+  {
+    _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO);
+  }
+  // Determine the default value of `task_size`
+  {
+    const ssize_t env = detail::getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024);
+    if (env <= 0) {
+      throw std::invalid_argument(
+        "KVIKIO_TASK_SIZE has to be a positive integer greater than zero");
+    }
+    _task_size = env;
+  }
+  // Determine the default value of `gds_threshold`
+  {
+    const ssize_t env = detail::getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024);
+    if (env < 0) {
+      throw std::invalid_argument("KVIKIO_GDS_THRESHOLD has to be a positive integer");
+    }
+    _gds_threshold = env;
+  }
+  // Determine the default value of `bounce_buffer_size`
+  {
+    const ssize_t env = detail::getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024);
+    if (env <= 0) {
+      throw std::invalid_argument(
+        "KVIKIO_BOUNCE_BUFFER_SIZE has to be a positive integer greater than zero");
+    }
+    _bounce_buffer_size = env;
+  }
+}
+
+defaults* defaults::instance()
+{
+  static defaults _instance;
+  return &_instance;
+}
+CompatMode defaults::compat_mode() { return instance()->_compat_mode; }
+
+void defaults::compat_mode_reset(CompatMode compat_mode) { instance()->_compat_mode = compat_mode; }
+
+CompatMode defaults::infer_compat_mode_if_auto(CompatMode compat_mode)
+{
+  if (compat_mode == CompatMode::AUTO) {
+    static auto inferred_compat_mode_for_auto = []() -> CompatMode {
+      return is_cufile_available() ? CompatMode::OFF : CompatMode::ON;
+    }();
+    return inferred_compat_mode_for_auto;
+  }
+  return compat_mode;
+}
+
+bool defaults::is_compat_mode_preferred(CompatMode compat_mode)
+{
+  return compat_mode == CompatMode::ON ||
+         (compat_mode == CompatMode::AUTO &&
+          defaults::infer_compat_mode_if_auto(compat_mode) == CompatMode::ON);
+}
+
+bool defaults::is_compat_mode_preferred() { return is_compat_mode_preferred(compat_mode()); }
+
+BS::thread_pool& defaults::thread_pool() { return instance()->_thread_pool; }
+
+unsigned int defaults::thread_pool_nthreads() { return thread_pool().get_thread_count(); }
+
+void defaults::thread_pool_nthreads_reset(unsigned int nthreads)
+{
+  if (nthreads == 0) {
+    throw std::invalid_argument("number of threads must be a positive integer greater than zero");
+  }
+  thread_pool().reset(nthreads);
+}
+
+std::size_t defaults::task_size() { return instance()->_task_size; }
+
+void defaults::task_size_reset(std::size_t nbytes)
+{
+  if (nbytes == 0) {
+    throw std::invalid_argument("task size must be a positive integer greater than zero");
+  }
+  instance()->_task_size = nbytes;
+}
+
+std::size_t defaults::gds_threshold() { return instance()->_gds_threshold; }
+
+void defaults::gds_threshold_reset(std::size_t nbytes) { instance()->_gds_threshold = nbytes; }
+
+std::size_t defaults::bounce_buffer_size() { return instance()->_bounce_buffer_size; }
+
+void defaults::bounce_buffer_size_reset(std::size_t nbytes)
+{
+  if (nbytes == 0) {
+    throw std::invalid_argument(
+      "size of the bounce buffer must be a positive integer greater than zero");
+  }
+  instance()->_bounce_buffer_size = nbytes;
+}
+
+}  // namespace kvikio
diff --git a/cpp/src/error.cpp b/cpp/src/error.cpp
new file mode 100644
index 0000000000..21ce736a65
--- /dev/null
+++ b/cpp/src/error.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <kvikio/error.hpp>
diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp
index f2b7fa15db..37e0f7729b 100644
--- a/cpp/src/file_handle.cpp
+++ b/cpp/src/file_handle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -163,9 +163,9 @@ FileHandle::FileHandle(const std::string& file_path,
   }
 }
 
-[[nodiscard]] int FileHandle::fd_open_flags() const { return open_flags(_fd_direct_off); }
+int FileHandle::fd_open_flags() const { return open_flags(_fd_direct_off); }
 
-[[nodiscard]] std::size_t FileHandle::nbytes() const
+std::size_t FileHandle::nbytes() const
 {
   if (closed()) { return 0; }
   if (_nbytes == 0) { _nbytes = get_file_size(_fd_direct_off); }
diff --git a/cpp/src/posix_io.cpp b/cpp/src/posix_io.cpp
new file mode 100644
index 0000000000..d4ee2944e5
--- /dev/null
+++ b/cpp/src/posix_io.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <unistd.h>
+#include <cstddef>
+#include <cstdlib>
+#include <map>
+#include <thread>
+
+#include <kvikio/bounce_buffer.hpp>
+#include <kvikio/error.hpp>
+#include <kvikio/posix_io.hpp>
+#include <kvikio/shim/cuda.hpp>
+#include <kvikio/utils.hpp>
+
+namespace kvikio::detail {
+
+CUstream StreamsByThread::get(CUcontext ctx, std::thread::id thd_id)
+{
+  static StreamsByThread _instance;
+
+  // If no current context, we return the null/default stream
+  if (ctx == nullptr) { return nullptr; }
+  auto key = std::make_pair(ctx, thd_id);
+  // Create a new stream if the (`ctx`, `thd_id`) pair doesn't have one yet.
+  // NOTE(review): `_streams` is accessed below without a lock; confirm callers serialize access.
+  if (auto search = _instance._streams.find(key); search == _instance._streams.end()) {
+    CUstream stream{};
+    CUDA_DRIVER_TRY(cudaAPI::instance().StreamCreate(&stream, CU_STREAM_DEFAULT));
+    _instance._streams[key] = stream;
+    return stream;
+  } else {
+    return search->second;
+  }
+}
+
+CUstream StreamsByThread::get()
+{
+  CUcontext ctx{nullptr};
+  CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
+  return get(ctx, std::this_thread::get_id());
+}
+
+std::size_t posix_device_read(int fd,
+                              const void* devPtr_base,
+                              std::size_t size,
+                              std::size_t file_offset,
+                              std::size_t devPtr_offset)
+{
+  KVIKIO_NVTX_SCOPED_RANGE("posix_device_read()", size);
+  return detail::posix_device_io<IOOperationType::READ>(
+    fd, devPtr_base, size, file_offset, devPtr_offset);
+}
+
+std::size_t posix_device_write(int fd,
+                               const void* devPtr_base,
+                               std::size_t size,
+                               std::size_t file_offset,
+                               std::size_t devPtr_offset)
+{
+  KVIKIO_NVTX_SCOPED_RANGE("posix_device_write()", size);
+  return detail::posix_device_io<IOOperationType::WRITE>(
+    fd, devPtr_base, size, file_offset, devPtr_offset);
+}
+
+}  // namespace kvikio::detail
diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp
index adcf56befc..9fd0690891 100644
--- a/cpp/src/remote_handle.cpp
+++ b/cpp/src/remote_handle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,106 @@
 
 namespace kvikio {
 
+namespace {
+
+/**
+ * @brief Bounce buffer in pinned host memory.
+ *
+ * @note Is not thread-safe.
+ */
+class BounceBufferH2D {
+  CUstream _stream;                 // The CUDA stream to use.
+  CUdeviceptr _dev;                 // The output device buffer.
+  AllocRetain::Alloc _host_buffer;  // The host buffer to bounce data on.
+  std::ptrdiff_t _dev_offset{0};    // Number of bytes written to `_dev`.
+  std::ptrdiff_t _host_offset{0};   // Number of bytes written to `_host_buffer` (resets on flush).
+
+ public:
+  /**
+   * @brief Create a bounce buffer for an output device buffer.
+   *
+   * @param stream The CUDA stream used throughout the lifetime of the bounce buffer.
+   * @param device_buffer The output device buffer (final destination of the data).
+   */
+  BounceBufferH2D(CUstream stream, void* device_buffer)
+    : _stream{stream},
+      _dev{convert_void2deviceptr(device_buffer)},
+      _host_buffer{AllocRetain::instance().get()}
+  {
+  }
+
+  /**
+   * @brief Flush the bounce buffer to device on destruction; a `CUfileException` is only logged.
+   */
+  ~BounceBufferH2D() noexcept
+  {
+    try {
+      flush();
+    } catch (CUfileException const& e) {
+      std::cerr << "BounceBufferH2D error on final flush: ";
+      std::cerr << e.what();
+      std::cerr << std::endl;
+    }
+  }
+
+ private:
+  /**
+   * @brief Copy host memory to the output device buffer and wait for the copy to complete.
+   *
+   * @param src The host memory source.
+   * @param size Number of bytes to write (a size of zero is a no-op).
+   */
+  void write_to_device(void const* src, std::size_t size)
+  {
+    if (size > 0) {
+      CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream));
+      CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream));
+      _dev_offset += size;
+    }
+  }
+
+  /**
+   * @brief Flush the bounce buffer by writing everything to the output device buffer.
+   */
+  void flush()
+  {
+    write_to_device(_host_buffer.get(), _host_offset);
+    _host_offset = 0;
+  }
+
+ public:
+  /**
+   * @brief Write host memory to the bounce buffer (also host memory).
+   *
+   * Data is copied to the output device buffer only once the bounce buffer cannot hold more.
+   *
+   * @param data The host memory source.
+   * @param size Number of bytes to write.
+   */
+  void write(char const* data, std::size_t size)
+  {
+    if (_host_buffer.size() - _host_offset < size) {  // Not enough space left in the bounce buffer
+      flush();
+      assert(_host_offset == 0);
+    }
+    if (_host_buffer.size() < size) {
+      // If there is still not enough space, we copy the data straight to the device. This only
+      // happens when `defaults::bounce_buffer_size()` is smaller than 16kb, so there is no need
+      // to optimize performance for this case.
+      write_to_device(data, size);
+    } else if (size > 0) {
+      std::memcpy(_host_buffer.get(_host_offset), data, size);
+      _host_offset += size;
+    }
+  }
+};
+
+}  // namespace
+
+HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {}
+
+std::string HttpEndpoint::str() const { return _url; }
+
 void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); }
 
 void S3Endpoint::setopt(CurlHandle& curl)
@@ -42,6 +142,114 @@ void S3Endpoint::setopt(CurlHandle& curl)
   curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str());
 }
 
+std::string S3Endpoint::unwrap_or_default(std::optional<std::string> aws_arg,
+                                          std::string const& env_var,
+                                          std::string const& err_msg)
+{
+  if (aws_arg.has_value()) { return std::move(*aws_arg); }
+
+  char const* env = std::getenv(env_var.c_str());
+  if (env == nullptr) {
+    if (err_msg.empty()) { return std::string(); }
+    throw std::invalid_argument(err_msg);
+  }
+  return std::string(env);
+}
+
+// Build the object URL: `<endpoint>/<bucket>/<object>` when an endpoint URL is given (argument or
+// AWS_ENDPOINT_URL), otherwise the official AWS URL scheme, which additionally needs a region.
+std::string S3Endpoint::url_from_bucket_and_object(std::string const& bucket_name,
+                                                   std::string const& object_name,
+                                                   std::optional<std::string> const& aws_region,
+                                                   std::optional<std::string> aws_endpoint_url)
+{
+  auto const endpoint_url = unwrap_or_default(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL");
+  if (endpoint_url.empty()) {
+    // `aws_region` is a const reference, so `std::move` would silently copy; pass it as-is.
+    auto const region =
+      unwrap_or_default(aws_region,
+                        "AWS_DEFAULT_REGION",
+                        "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set.");
+    // We default to the official AWS url scheme.
+    return "https://" + bucket_name + ".s3." + region + ".amazonaws.com/" + object_name;
+  }
+  return endpoint_url + "/" + bucket_name + "/" + object_name;
+}
+
+std::pair<std::string, std::string> S3Endpoint::parse_s3_url(std::string const& s3_url)
+{
+  // Matches s3://<bucket>/<object> case-insensitively; `static` so it is compiled only once.
+  static std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase};
+  std::smatch matches;
+  if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; }
+  throw std::invalid_argument("Input string does not match the expected S3 URL format.");
+}
+
+S3Endpoint::S3Endpoint(std::string url,
+                       std::optional<std::string> aws_region,
+                       std::optional<std::string> aws_access_key,
+                       std::optional<std::string> aws_secret_access_key)
+  : _url{std::move(url)}
+{
+  // Regular expression to match http[s]://
+  std::regex pattern{R"(^https?://.*)", std::regex_constants::icase};
+  if (!std::regex_search(_url, pattern)) {
+    throw std::invalid_argument("url must start with http:// or https://");
+  }
+
+  auto const region =
+    unwrap_or_default(std::move(aws_region),
+                      "AWS_DEFAULT_REGION",
+                      "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set.");
+
+  auto const access_key =
+    unwrap_or_default(std::move(aws_access_key),
+                      "AWS_ACCESS_KEY_ID",
+                      "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set.");
+
+  auto const secret_access_key = unwrap_or_default(
+    std::move(aws_secret_access_key),
+    "AWS_SECRET_ACCESS_KEY",
+    "S3: must provide `aws_secret_access_key` if AWS_SECRET_ACCESS_KEY isn't set.");
+
+  // Create the CURLOPT_AWS_SIGV4 option
+  {
+    std::stringstream ss;
+    ss << "aws:amz:" << region << ":s3";
+    _aws_sigv4 = ss.str();
+  }
+  // Create the CURLOPT_USERPWD option
+  // Notice, curl uses `secret_access_key` to generate a AWS V4 signature. It is NOT included
+  // in the http header. See
+  // <https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_sigv-create-signed-request.html>
+  {
+    std::stringstream ss;
+    ss << access_key << ":" << secret_access_key;
+    _aws_userpwd = ss.str();
+  }
+}
+
+S3Endpoint::S3Endpoint(std::string const& bucket_name,
+                       std::string const& object_name,
+                       std::optional<std::string> aws_region,
+                       std::optional<std::string> aws_access_key,
+                       std::optional<std::string> aws_secret_access_key,
+                       std::optional<std::string> aws_endpoint_url)
+  : S3Endpoint(
+      url_from_bucket_and_object(bucket_name, object_name, aws_region, std::move(aws_endpoint_url)),
+      aws_region,  // Not moved: it is also read by the call above, in unspecified order.
+      std::move(aws_access_key),
+      std::move(aws_secret_access_key))
+{
+}
+
+std::string S3Endpoint::str() const { return _url; }
+
+RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes)
+  : _endpoint{std::move(endpoint)}, _nbytes{nbytes}
+{
+}
+
 RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint)
 {
   auto curl = create_curl_handle();
@@ -60,6 +268,10 @@ RemoteHandle::RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint)
   _endpoint = std::move(endpoint);
 }
 
+std::size_t RemoteHandle::nbytes() const noexcept { return _nbytes; }
+
+RemoteEndpoint const& RemoteHandle::endpoint() const noexcept { return *_endpoint; }
+
 namespace {
 
 /**
@@ -74,7 +286,7 @@ struct CallbackContext {
     : buf{static_cast<char*>(buf)}, size{size}, offset{0}, overflow_error{0}
   {
   }
-  detail::BounceBufferH2D* bounce_buffer{nullptr};  // Only used by callback_device_memory
+  BounceBufferH2D* bounce_buffer{nullptr};  // Only used by callback_device_memory
 };
 
 /**
@@ -166,7 +378,7 @@ std::size_t RemoteHandle::read(void* buf, std::size_t size, std::size_t file_off
       PushAndPopContext c(get_context_from_pointer(buf));
       // We use a bounce buffer to avoid many small memory copies to device. Libcurl has a
       // maximum chunk size of 16kb (`CURL_MAX_WRITE_SIZE`) but chunks are often much smaller.
-      detail::BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf);
+      BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf);
       ctx.bounce_buffer = &bounce_buffer;
       curl.perform();
     }
diff --git a/cpp/src/shim/utils.cpp b/cpp/src/shim/utils.cpp
index ab9afbf648..314fb5382a 100644
--- a/cpp/src/shim/utils.cpp
+++ b/cpp/src/shim/utils.cpp
@@ -45,7 +45,7 @@ void* load_library(const std::vector<const char*>& names, int mode)
   throw std::runtime_error("cannot open shared object file, tried: " + ss.str());
 }
 
-[[nodiscard]] bool is_running_in_wsl()
+bool is_running_in_wsl()
 {
   struct utsname buf {};
   int err = ::uname(&buf);
@@ -57,7 +57,7 @@ void* load_library(const std::vector<const char*>& names, int mode)
   return false;
 }
 
-[[nodiscard]] bool run_udev_readable()
+bool run_udev_readable()
 {
   try {
     return std::filesystem::is_directory("/run/udev");
diff --git a/cpp/src/stream.cpp b/cpp/src/stream.cpp
new file mode 100644
index 0000000000..ac0f8138e4
--- /dev/null
+++ b/cpp/src/stream.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/types.h>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <new>
+#include <tuple>
+#include <utility>
+
+#include <kvikio/error.hpp>
+#include <kvikio/shim/cuda.hpp>
+#include <kvikio/shim/cufile.hpp>
+#include <kvikio/stream.hpp>
+
+namespace kvikio {
+
+// Record the device pointer and stream, and place the remaining cuFile arguments in a
+// malloc'ed `ArgByVal` so that their addresses can be handed out by `get_args()`.
+StreamFuture::StreamFuture(
+  void* devPtr_base, std::size_t size, off_t file_offset, off_t devPtr_offset, CUstream stream)
+  : _devPtr_base{devPtr_base}, _stream{stream}
+{
+  // Notice, we allocate the arguments using malloc() as specified in the cuFile docs:
+  // <https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html#cufilewriteasync>
+  if ((_val = static_cast<ArgByVal*>(std::malloc(sizeof(ArgByVal)))) == nullptr) {
+    throw std::bad_alloc{};
+  }
+  *_val = {
+    .size = size, .file_offset = file_offset, .devPtr_offset = devPtr_offset, .bytes_done = 0};
+}
+
+// Take over the state of `o` and leave it without a device pointer, stream or argument block.
+StreamFuture::StreamFuture(StreamFuture&& o) noexcept
+  : _devPtr_base{std::exchange(o._devPtr_base, nullptr)},
+    _stream{std::exchange(o._stream, nullptr)},
+    _val{std::exchange(o._val, nullptr)},
+    _stream_synchronized{o._stream_synchronized}
+{
+}
+
+// Exchange state with `o` rather than overwriting it: whatever `*this` owned before the
+// assignment ends up in `o` and is released by `o`'s destructor, so a previously held
+// `ArgByVal` allocation is not leaked.
+StreamFuture& StreamFuture::operator=(StreamFuture&& o) noexcept
+{
+  std::swap(_devPtr_base, o._devPtr_base);
+  std::swap(_stream, o._stream);
+  std::swap(_val, o._val);
+  std::swap(_stream_synchronized, o._stream_synchronized);
+  return *this;
+}
+
+// Return the device base pointer, pointers to the fields of the `ArgByVal` block, and the
+// stream. Throws `CUfileException` when no `ArgByVal` block is held.
+std::tuple<void*, std::size_t*, off_t*, off_t*, ssize_t*, CUstream> StreamFuture::get_args() const
+{
+  if (_val == nullptr) {
+    throw kvikio::CUfileException("cannot get arguments from an uninitialized StreamFuture");
+  }
+  return {_devPtr_base,
+          &_val->size,
+          &_val->file_offset,
+          &_val->devPtr_offset,
+          &_val->bytes_done,
+          _stream};
+}
+
+// Synchronize the stream (first call only) and return the recorded byte count. Throws
+// `CUfileException` when no `ArgByVal` block is held, when the synchronization reports an
+// error, or when the recorded count is negative.
+std::size_t StreamFuture::check_bytes_done()
+{
+  if (_val == nullptr) {
+    throw kvikio::CUfileException("cannot check bytes done on an uninitialized StreamFuture");
+  }
+
+  if (!_stream_synchronized) {
+    _stream_synchronized = true;
+    CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream));
+  }
+
+  CUFILE_CHECK_BYTES_DONE(_val->bytes_done);
+  // At this point, we know `_val->bytes_done` is a non-negative value otherwise
+  // CUFILE_CHECK_BYTES_DONE() would have raised an exception.
+  return static_cast<std::size_t>(_val->bytes_done);
+}
+
+// Wait for the pending operation (if any) before freeing the argument block. Errors are
+// reported on stderr instead of being propagated, since this destructor is `noexcept`.
+StreamFuture::~StreamFuture() noexcept
+{
+  if (_val != nullptr) {
+    try {
+      check_bytes_done();
+    } catch (const std::exception& e) {
+      std::cerr << e.what() << std::endl;
+    }
+    std::free(_val);
+  }
+}
+
+}  // namespace kvikio
diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp
index 32834cf3a4..4ba5a757a2 100644
--- a/cpp/src/utils.cpp
+++ b/cpp/src/utils.cpp
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
 
 #include <cstring>
 #include <future>
@@ -163,9 +162,7 @@ PushAndPopContext::~PushAndPopContext()
   }
 }
 
-// Find the base and offset of the memory allocation `devPtr` is in
-std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
-                                                           CUcontext* ctx = nullptr)
+std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr, CUcontext* ctx)
 {
   auto dev = convert_void2deviceptr(devPtr);
   CUdeviceptr base_ptr{};

From 421c46d5a49dbe8fb8aba9385cdd649faba0dec2 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Thu, 9 Jan 2025 09:16:53 -0500
Subject: [PATCH 5/8] Minor fixes

---
 cpp/include/kvikio/defaults.hpp      | 2 +-
 cpp/include/kvikio/remote_handle.hpp | 2 +-
 cpp/include/kvikio/shim/cuda.hpp     | 1 -
 cpp/include/kvikio/utils.hpp         | 2 +-
 4 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp
index 4c87724445..c65b1fa0bf 100644
--- a/cpp/include/kvikio/defaults.hpp
+++ b/cpp/include/kvikio/defaults.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp
index ff0741ae3f..17db472825 100644
--- a/cpp/include/kvikio/remote_handle.hpp
+++ b/cpp/include/kvikio/remote_handle.hpp
@@ -226,7 +226,7 @@ class RemoteHandle {
    *
    * @return The number of bytes.
    */
-  std::size_t nbytes() const noexcept;
+  [[nodiscard]] std::size_t nbytes() const noexcept;
 
   /**
    * @brief Get a const reference to the underlying remote endpoint.
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index f868d40f58..9aaac08827 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <kvikio/shim/cuda.hpp>
 #include <kvikio/shim/cuda_h_wrapper.hpp>
 #include <kvikio/shim/utils.hpp>
 
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 5593c6c9de..09d10cbcae 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -150,7 +150,7 @@ std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr,
                                                            CUcontext* ctx = nullptr);
 
 template <typename T>
-inline bool is_future_done(const T& future)
+bool is_future_done(const T& future)
 {
   return future.wait_for(std::chrono::seconds(0)) != std::future_status::timeout;
 }

From 07ad349b66012766cbc8d961503f5f1110886cea Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Thu, 9 Jan 2025 10:29:27 -0500
Subject: [PATCH 6/8] Fix the preamble date

---
 cpp/include/kvikio/cufile/config.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/kvikio/cufile/config.hpp b/cpp/include/kvikio/cufile/config.hpp
index 7dd9ee7bcb..5d48106c9b 100644
--- a/cpp/include/kvikio/cufile/config.hpp
+++ b/cpp/include/kvikio/cufile/config.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 688ceed59e9f93445b58cb776fc86aeba623a6b4 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Thu, 9 Jan 2025 12:17:20 -0500
Subject: [PATCH 7/8] Remove inline. Move getenv_or outside the detail
 namespace. Add noexcept to dtor

---
 cpp/include/kvikio/cufile/driver.hpp |  2 +-
 cpp/include/kvikio/defaults.hpp      |  4 ++--
 cpp/include/kvikio/utils.hpp         |  2 +-
 cpp/src/cufile/config.cpp            |  2 +-
 cpp/src/cufile/driver.cpp            |  6 +++---
 cpp/src/defaults.cpp                 | 20 ++++++++++----------
 cpp/src/remote_handle.cpp            | 10 ++--------
 7 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/cpp/include/kvikio/cufile/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp
index 4a33289bda..56a6e8159b 100644
--- a/cpp/include/kvikio/cufile/driver.hpp
+++ b/cpp/include/kvikio/cufile/driver.hpp
@@ -34,7 +34,7 @@ class DriverInitializer {
   DriverInitializer(DriverInitializer&&) noexcept            = delete;
   DriverInitializer& operator=(DriverInitializer&&) noexcept = delete;
 
-  ~DriverInitializer();
+  ~DriverInitializer() noexcept;
 };
 
 class DriverProperties {
diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp
index c65b1fa0bf..ad8b5a3e40 100644
--- a/cpp/include/kvikio/defaults.hpp
+++ b/cpp/include/kvikio/defaults.hpp
@@ -57,6 +57,8 @@ namespace detail {
  */
 CompatMode parse_compat_mode_str(std::string_view compat_mode_str);
 
+}  // namespace detail
+
 template <typename T>
 T getenv_or(std::string_view env_var_name, T default_val)
 {
@@ -79,8 +81,6 @@ bool getenv_or(std::string_view env_var_name, bool default_val);
 template <>
 CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val);
 
-}  // namespace detail
-
 /**
  * @brief Singleton class of default values used throughout KvikIO.
  *
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 09d10cbcae..5bea6f17c7 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -93,7 +93,7 @@ constexpr bool is_host_memory(const void* ptr) { return true; }
  * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
  * @return Primary CUDA context
  */
-[[nodiscard]] KVIKIO_EXPORT inline CUcontext get_primary_cuda_context(int ordinal);
+[[nodiscard]] KVIKIO_EXPORT CUcontext get_primary_cuda_context(int ordinal);
 
 /**
  * @brief Return the CUDA context associated the given device pointer, if any.
diff --git a/cpp/src/cufile/config.cpp b/cpp/src/cufile/config.cpp
index 2abbf33e92..b27475b8da 100644
--- a/cpp/src/cufile/config.cpp
+++ b/cpp/src/cufile/config.cpp
@@ -23,7 +23,7 @@
 namespace kvikio {
 namespace {
 
-[[nodiscard]] inline const char* lookup_config_path()
+[[nodiscard]] const char* lookup_config_path()
 {
   const char* env = std::getenv("CUFILE_ENV_PATH_JSON");
   if (env != nullptr && std::filesystem::exists(env)) { return env; }
diff --git a/cpp/src/cufile/driver.cpp b/cpp/src/cufile/driver.cpp
index 13a23f547c..ffee213c00 100644
--- a/cpp/src/cufile/driver.cpp
+++ b/cpp/src/cufile/driver.cpp
@@ -25,12 +25,12 @@
 namespace kvikio {
 namespace {
 
-[[nodiscard]] inline bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept
+[[nodiscard]] bool get_driver_flag(unsigned int prop, unsigned int flag) noexcept
 {
   return (prop & (1U << flag)) != 0;
 }
 
-inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept
+void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noexcept
 {
   if (val) {
     prop |= (1U << flag);
@@ -44,7 +44,7 @@ inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noe
 
 DriverInitializer::DriverInitializer() { cuFileAPI::instance().driver_open(); }
 
-DriverInitializer::~DriverInitializer()
+DriverInitializer::~DriverInitializer() noexcept
 {
   try {
     cuFileAPI::instance().driver_close();
diff --git a/cpp/src/defaults.cpp b/cpp/src/defaults.cpp
index f249a8b361..affcac6be3 100644
--- a/cpp/src/defaults.cpp
+++ b/cpp/src/defaults.cpp
@@ -49,8 +49,10 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str)
   return res;
 }
 
+}  // namespace detail
+
 template <>
-inline bool getenv_or(std::string_view env_var_name, bool default_val)
+bool getenv_or(std::string_view env_var_name, bool default_val)
 {
   const auto* env_val = std::getenv(env_var_name.data());
   if (env_val == nullptr) { return default_val; }
@@ -82,18 +84,16 @@ inline bool getenv_or(std::string_view env_var_name, bool default_val)
 }
 
 template <>
-inline CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val)
+CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val)
 {
   auto* env_val = std::getenv(env_var_name.data());
   if (env_val == nullptr) { return default_val; }
-  return parse_compat_mode_str(env_val);
+  return detail::parse_compat_mode_str(env_val);
 }
 
-}  // namespace detail
-
 unsigned int defaults::get_num_threads_from_env()
 {
-  const int ret = detail::getenv_or("KVIKIO_NTHREADS", 1);
+  const int ret = getenv_or("KVIKIO_NTHREADS", 1);
   if (ret <= 0) {
     throw std::invalid_argument("KVIKIO_NTHREADS has to be a positive integer greater than zero");
   }
@@ -104,11 +104,11 @@ defaults::defaults()
 {
   // Determine the default value of `compat_mode`
   {
-    _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO);
+    _compat_mode = getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO);
   }
   // Determine the default value of `task_size`
   {
-    const ssize_t env = detail::getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024);
+    const ssize_t env = getenv_or("KVIKIO_TASK_SIZE", 4 * 1024 * 1024);
     if (env <= 0) {
       throw std::invalid_argument(
         "KVIKIO_TASK_SIZE has to be a positive integer greater than zero");
@@ -117,7 +117,7 @@ defaults::defaults()
   }
   // Determine the default value of `gds_threshold`
   {
-    const ssize_t env = detail::getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024);
+    const ssize_t env = getenv_or("KVIKIO_GDS_THRESHOLD", 1024 * 1024);
     if (env < 0) {
       throw std::invalid_argument("KVIKIO_GDS_THRESHOLD has to be a positive integer");
     }
@@ -125,7 +125,7 @@ defaults::defaults()
   }
   // Determine the default value of `bounce_buffer_size`
   {
-    const ssize_t env = detail::getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024);
+    const ssize_t env = getenv_or("KVIKIO_BOUNCE_BUFFER_SIZE", 16 * 1024 * 1024);
     if (env <= 0) {
       throw std::invalid_argument(
         "KVIKIO_BOUNCE_BUFFER_SIZE has to be a positive integer greater than zero");
diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp
index 9fd0690891..8ca04f94ed 100644
--- a/cpp/src/remote_handle.cpp
+++ b/cpp/src/remote_handle.cpp
@@ -299,10 +299,7 @@ struct CallbackContext {
  * @param nmemb Size of the data in `nmemb`.
  * @param context A pointer to an instance of `CallbackContext`.
  */
-inline std::size_t callback_host_memory(char* data,
-                                        std::size_t size,
-                                        std::size_t nmemb,
-                                        void* context)
+std::size_t callback_host_memory(char* data, std::size_t size, std::size_t nmemb, void* context)
 {
   auto ctx                 = reinterpret_cast<CallbackContext*>(context);
   std::size_t const nbytes = size * nmemb;
@@ -326,10 +323,7 @@ inline std::size_t callback_host_memory(char* data,
  * @param nmemb Size of the data in `nmemb`.
  * @param context A pointer to an instance of `CallbackContext`.
  */
-inline std::size_t callback_device_memory(char* data,
-                                          std::size_t size,
-                                          std::size_t nmemb,
-                                          void* context)
+std::size_t callback_device_memory(char* data, std::size_t size, std::size_t nmemb, void* context)
 {
   auto ctx                 = reinterpret_cast<CallbackContext*>(context);
   std::size_t const nbytes = size * nmemb;

From 2700acedf86d9b01b0feafc75466bf7259073c71 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Fri, 10 Jan 2025 15:17:44 -0500
Subject: [PATCH 8/8] Improve code quality of error.hpp

---
 cpp/include/kvikio/error.hpp | 106 ++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 44 deletions(-)

diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp
index 2ecd37b0b3..33a4730b79 100644
--- a/cpp/include/kvikio/error.hpp
+++ b/cpp/include/kvikio/error.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,26 +33,9 @@ struct CUfileException : public std::runtime_error {
   GET_CUDA_DRIVER_TRY_MACRO(__VA_ARGS__, CUDA_DRIVER_TRY_2, CUDA_DRIVER_TRY_1) \
   (__VA_ARGS__)
 #define GET_CUDA_DRIVER_TRY_MACRO(_1, _2, NAME, ...) NAME
-#define CUDA_DRIVER_TRY_2(_call, _exception_type)                                              \
-  do {                                                                                         \
-    CUresult const error = (_call);                                                            \
-    if (error == CUDA_ERROR_STUB_LIBRARY) {                                                    \
-      throw(_exception_type){std::string{"CUDA error at: "} + __FILE__ + ":" +                 \
-                             KVIKIO_STRINGIFY(__LINE__) +                                      \
-                             ": CUDA_ERROR_STUB_LIBRARY("                                      \
-                             "The CUDA driver loaded is a stub library)"};                     \
-    }                                                                                          \
-    if (error != CUDA_SUCCESS) {                                                               \
-      const char* err_name     = nullptr;                                                      \
-      const char* err_str      = nullptr;                                                      \
-      CUresult err_name_status = kvikio::cudaAPI::instance().GetErrorName(error, &err_name);   \
-      CUresult err_str_status  = kvikio::cudaAPI::instance().GetErrorString(error, &err_str);  \
-      if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; }               \
-      if (err_str_status == CUDA_ERROR_INVALID_VALUE) { err_str = "unknown"; }                 \
-      throw(_exception_type){std::string{"CUDA error at: "} + __FILE__ + ":" +                 \
-                             KVIKIO_STRINGIFY(__LINE__) + ": " + std::string(err_name) + "(" + \
-                             std::string(err_str) + ")"};                                      \
-    }                                                                                          \
+#define CUDA_DRIVER_TRY_2(_call, _exception_type)                                  \
+  do {                                                                             \
+    kvikio::detail::cuda_driver_try_2<_exception_type>(_call, __LINE__, __FILE__); \
   } while (0)
 #define CUDA_DRIVER_TRY_1(_call) CUDA_DRIVER_TRY_2(_call, kvikio::CUfileException)
 #endif
@@ -62,18 +45,9 @@ struct CUfileException : public std::runtime_error {
   GET_CUFILE_TRY_MACRO(__VA_ARGS__, CUFILE_TRY_2, CUFILE_TRY_1) \
   (__VA_ARGS__)
 #define GET_CUFILE_TRY_MACRO(_1, _2, NAME, ...) NAME
-#define CUFILE_TRY_2(_call, _exception_type)                                             \
-  do {                                                                                   \
-    CUfileError_t const error = (_call);                                                 \
-    if (error.err != CU_FILE_SUCCESS) {                                                  \
-      if (error.err == CU_FILE_CUDA_DRIVER_ERROR) {                                      \
-        CUresult const cuda_error = error.cu_err;                                        \
-        CUDA_DRIVER_TRY(cuda_error);                                                     \
-      }                                                                                  \
-      throw(_exception_type){std::string{"cuFile error at: "} + __FILE__ + ":" +         \
-                             KVIKIO_STRINGIFY(__LINE__) + ": " +                         \
-                             cufileop_status_error((CUfileOpError)std::abs(error.err))}; \
-    }                                                                                    \
+#define CUFILE_TRY_2(_call, _exception_type)                                  \
+  do {                                                                        \
+    kvikio::detail::cufile_try_2<_exception_type>(_call, __LINE__, __FILE__); \
   } while (0)
 #define CUFILE_TRY_1(_call) CUFILE_TRY_2(_call, kvikio::CUfileException)
 #endif
@@ -84,19 +58,97 @@ struct CUfileException : public std::runtime_error {
     __VA_ARGS__, CUFILE_CHECK_BYTES_DONE_2, CUFILE_CHECK_BYTES_DONE_1) \
   (__VA_ARGS__)
 #define GET_CUFILE_CHECK_BYTES_DONE_MACRO(_1, _2, NAME, ...) NAME
-#define CUFILE_CHECK_BYTES_DONE_2(_nbytes_done, _exception_type)                  \
-  do {                                                                            \
-    auto const _nbytes = (_nbytes_done);                                          \
-    if (_nbytes < 0) {                                                            \
-      auto const err = std::abs(_nbytes);                                         \
-      auto const msg = (err > CUFILEOP_BASE_ERR)                                  \
-                         ? std::string(cufileop_status_error((CUfileOpError)err)) \
-                         : std::string(std::strerror(err));                       \
-      throw(_exception_type){std::string{"cuFile error at: "} + __FILE__ + ":" +  \
-                             KVIKIO_STRINGIFY(__LINE__) + ": " + msg};            \
-    }                                                                             \
+#define CUFILE_CHECK_BYTES_DONE_2(_nbytes_done, _exception_type)                                  \
+  do {                                                                                            \
+    kvikio::detail::cufile_check_bytes_done_2<_exception_type>(_nbytes_done, __LINE__, __FILE__); \
   } while (0)
 #define CUFILE_CHECK_BYTES_DONE_1(_call) CUFILE_CHECK_BYTES_DONE_2(_call, kvikio::CUfileException)
 #endif
 
+namespace detail {
+/**
+ * @brief Throw an exception if a CUDA driver call did not succeed.
+ *
+ * @tparam Exception Type of the exception to throw.
+ * @param error Result code of the CUDA driver call.
+ * @param line_number Line number reported in the exception message.
+ * @param filename File name reported in the exception message.
+ */
+template <typename Exception>
+void cuda_driver_try_2(CUresult error, int line_number, const char* filename)
+{
+  // `line_number` is a runtime value, so it is formatted with `std::to_string`; stringifying
+  // it with a preprocessor macro would emit the parameter's name instead of the number.
+  if (error == CUDA_ERROR_STUB_LIBRARY) {
+    throw Exception{std::string{"CUDA error at: "} + filename + ":" +
+                    std::to_string(line_number) +
+                    ": CUDA_ERROR_STUB_LIBRARY("
+                    "The CUDA driver loaded is a stub library)"};
+  }
+  if (error != CUDA_SUCCESS) {
+    const char* err_name     = nullptr;
+    const char* err_str      = nullptr;
+    CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name);
+    CUresult err_str_status  = cudaAPI::instance().GetErrorString(error, &err_str);
+    if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; }
+    if (err_str_status == CUDA_ERROR_INVALID_VALUE) { err_str = "unknown"; }
+    throw Exception{std::string{"CUDA error at: "} + filename + ":" +
+                    std::to_string(line_number) + ": " + std::string(err_name) + "(" +
+                    std::string(err_str) + ")"};
+  }
+}
+
+/**
+ * @brief Throw an exception if a cuFile call did not succeed.
+ *
+ * When cuFile reports a CUDA driver error, the embedded driver result is checked first and
+ * reported as a `CUfileException` carrying the same file name and line number.
+ *
+ * @tparam Exception Type of the exception to throw for cuFile errors.
+ * @param error Status returned by the cuFile call.
+ * @param line_number Line number reported in the exception message.
+ * @param filename File name reported in the exception message.
+ */
+template <typename Exception>
+void cufile_try_2(CUfileError_t error, int line_number, const char* filename)
+{
+  if (error.err != CU_FILE_SUCCESS) {
+    if (error.err == CU_FILE_CUDA_DRIVER_ERROR) {
+      // Forward the caller's location; going through the CUDA_DRIVER_TRY macro here would
+      // report this header's file name and line number instead.
+      CUresult const cuda_error = error.cu_err;
+      cuda_driver_try_2<CUfileException>(cuda_error, line_number, filename);
+    }
+    throw Exception{std::string{"cuFile error at: "} + filename + ":" +
+                    std::to_string(line_number) + ": " +
+                    cufileop_status_error(static_cast<CUfileOpError>(std::abs(error.err)))};
+  }
+}
+
+/**
+ * @brief Throw an exception if a cuFile byte count is negative.
+ *
+ * The magnitude of a negative count is treated as a cuFile error code when it exceeds
+ * `CUFILEOP_BASE_ERR` and as an `errno` value otherwise.
+ *
+ * @tparam Exception Type of the exception to throw.
+ * @param nbytes_done Byte count to check.
+ * @param line_number Line number reported in the exception message.
+ * @param filename File name reported in the exception message.
+ */
+template <typename Exception>
+void cufile_check_bytes_done_2(ssize_t nbytes_done, int line_number, const char* filename)
+{
+  if (nbytes_done < 0) {
+    auto const err = std::abs(nbytes_done);
+    auto const msg = (err > CUFILEOP_BASE_ERR)
+                       ? std::string(cufileop_status_error(static_cast<CUfileOpError>(err)))
+                       : std::string(std::strerror(err));
+    throw Exception{std::string{"cuFile error at: "} + filename + ":" +
+                    std::to_string(line_number) + ": " + msg};
+  }
+}
+
+}  // namespace detail
+
 }  // namespace kvikio