diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8eb52ac7..825f4f78 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,6 +18,7 @@ option(BUILD_WITH_TBB "Build with TBB support" OFF)
 option(BUILD_WITH_OPENMP "Build with OpenMP support" ON)
 option(BUILD_WITH_CUDA "Build with GPU support" OFF)
 option(BUILD_WITH_CUDA_MULTIARCH "Build with CUDA multi-architecture support" OFF)
+option(BUILD_WITH_HIP "Build with ROCm/HIP GPU support (AMD)" OFF)
 option(BUILD_WITH_MARCH_NATIVE "Build with -march=native" OFF)
 option(ENABLE_CPPCHECK "Enable cppcheck" OFF)
 option(ENABLE_COVERAGE "Enable coverage check" OFF)
@@ -112,6 +113,33 @@ if(BUILD_WITH_CUDA)
   set(GTSAM_POINTS_CUDA_VERSION_PATCH ${CUDAToolkit_VERSION_PATCH})
 endif()
 
+# GPU-related (ROCm/HIP for AMD). Mirrors the BUILD_WITH_CUDA arm above: keeps
+# GTSAM_POINTS_USE_CUDA / BUILD_GTSAM_POINTS_GPU defined so all existing GPU
+# guards compile unchanged, and builds the same gtsam_points_cuda source list
+# with the .cu sources marked LANGUAGE HIP (see the GPU library section).
+if(BUILD_WITH_HIP)
+  set(GTSAM_POINTS_USE_CUDA 1)
+  set(GTSAM_POINTS_USE_HIP 1)
+  add_definitions(-DBUILD_GTSAM_POINTS_GPU)
+
+  # Windows/clang needs _USE_MATH_DEFINES for M_PI and NOMINMAX to prevent
+  # the Windows min/max macros from colliding with std::min/max.
+  if(WIN32)
+    add_compile_definitions(_USE_MATH_DEFINES NOMINMAX)
+  endif()
+
+  find_package(hip REQUIRED)
+  # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects
+  # the host GPU and errors on a no-GPU build host.
+  enable_language(HIP)
+  message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
+
+  set(GTSAM_POINTS_CUDA_VERSION 0)
+  set(GTSAM_POINTS_CUDA_VERSION_MAJOR 0)
+  set(GTSAM_POINTS_CUDA_VERSION_MINOR 0)
+  set(GTSAM_POINTS_CUDA_VERSION_PATCH 0)
+endif()
+
 if(ENABLE_CPPCHECK)
   set(CMAKE_CXX_CPPCHECK "cppcheck")
   list(APPEND CMAKE_CXX_CPPCHECK
@@ -150,7 +178,16 @@ configure_file(include/gtsam_points/config.hpp.in include/gtsam_points/config.hp
 ###########
 
 
-add_library(gtsam_points SHARED
+if(WIN32 AND BUILD_WITH_HIP)
+  # On Windows with a static GTSAM (GTSAM_EXPORT = empty, no dllimport), both
+  # gtsam_points and gtsam_points_cuda are built as STATIC to avoid circular DLL
+  # import issues (PointCloudGPU inherits PointCloudCPU across library boundary).
+  # Test executables link all three static libs directly.
+  set(_gtsam_points_type STATIC)
+else()
+  set(_gtsam_points_type SHARED)
+endif()
+add_library(gtsam_points ${_gtsam_points_type}
   # util
   src/gtsam_points/util/parallelism.cpp
   src/gtsam_points/util/bspline.cpp
@@ -227,14 +264,37 @@ target_link_libraries(gtsam_points
   $<TARGET_NAME_IF_EXISTS:TBB::tbb>
   $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
 )
-set_target_properties(gtsam_points PROPERTIES
-  VERSION   ${PROJECT_VERSION}
-  SOVERSION 1
-)
+if(NOT WIN32 OR NOT BUILD_WITH_HIP)
+  set_target_properties(gtsam_points PROPERTIES
+    VERSION   ${PROJECT_VERSION}
+    SOVERSION 1
+  )
+endif()
+
+if(BUILD_WITH_HIP)
+  # The public API uses CUstream_st* etc. The main library's host .cpp call into
+  # the GPU library across that type, so they must see the same HIP opaque-type
+  # bridge (CUstream_st -> ihipStream_t) for name mangling to match. Force-
+  # include the lightweight type bridge (forward decls only, no HIP runtime).
+  target_compile_definitions(gtsam_points PRIVATE USE_HIP)
+  target_compile_options(gtsam_points PRIVATE
+    -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip_types.h
+  )
+endif()
 
 # GPU-related
-if(BUILD_WITH_CUDA)
-  add_library(gtsam_points_cuda SHARED
+if(BUILD_WITH_CUDA OR BUILD_WITH_HIP)
+  # On Windows the GPU lib and the main lib have a mutual symbol dependency
+  # (PointCloudGPU inherits PointCloudCPU across the library boundary).
+  # Build as STATIC on Windows/HIP so the GPU objects are absorbed into
+  # gtsam_points.dll at final link, avoiding the circular DLL import.
+  # On Linux/CUDA the existing SHARED build is preserved.
+  if(WIN32 AND BUILD_WITH_HIP)
+    set(_gtsam_points_cuda_type STATIC)
+  else()
+    set(_gtsam_points_cuda_type SHARED)
+  endif()
+  add_library(gtsam_points_cuda ${_gtsam_points_cuda_type}
     # cuda-related
     src/gtsam_points/cuda/check_error.cu
     src/gtsam_points/cuda/check_error_cusolver.cu
@@ -269,8 +329,61 @@ if(BUILD_WITH_CUDA)
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:include>
   )
+
+  if(BUILD_WITH_HIP)
+    # Compile the .cu sources as HIP. The .cpp sources here are host-only
+    # orchestration (streams + async alloc, no device kernels), so they stay
+    # CXX; both languages get the compat header force-included and the
+    # CUDA-toolkit-named forwarding shims on a PRIVATE+BEFORE include path.
+    set(gtsam_points_cuda_hip_sources
+      src/gtsam_points/cuda/check_error.cu
+      src/gtsam_points/cuda/check_error_cusolver.cu
+      src/gtsam_points/cuda/check_error_curand.cu
+      src/gtsam_points/cuda/cuda_memory.cu
+      src/gtsam_points/cuda/cuda_stream.cu
+      src/gtsam_points/cuda/cuda_buffer.cu
+      src/gtsam_points/cuda/cuda_device_sync.cu
+      src/gtsam_points/cuda/cuda_device_prop.cu
+      src/gtsam_points/cuda/cuda_graph.cu
+      src/gtsam_points/cuda/cuda_graph_exec.cu
+      src/gtsam_points/cuda/stream_roundrobin.cu
+      src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu
+      src/gtsam_points/types/point_cloud.cu
+      src/gtsam_points/types/point_cloud_gpu.cu
+      src/gtsam_points/types/gaussian_voxelmap_gpu.cu
+      src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives_inliers.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives_compute.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives_linearize.cu
+      src/gtsam_points/util/easy_profiler_cuda.cu
+    )
+    set_source_files_properties(${gtsam_points_cuda_hip_sources} PROPERTIES LANGUAGE HIP)
+    set_target_properties(gtsam_points_cuda PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
+
+    # PRIVATE+BEFORE so <cuda.h>/<cuda_runtime.h>/<device_atomic_functions.h>/
+    # <curand.h>/<cusparse.h> resolve to the forwarding shims on the HIP build
+    # only (the dir is absent on the CUDA build, so the real toolkit wins).
+    target_include_directories(gtsam_points_cuda BEFORE PRIVATE
+      ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat
+    )
+    # Force-include the compat header into every TU (CXX and HIP) so sources
+    # keep their CUDA spelling and the CUstream_st/graph type bridge is in scope.
+    target_compile_options(gtsam_points_cuda PRIVATE
+      -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip.h
+    )
+    target_compile_definitions(gtsam_points_cuda PRIVATE USE_HIP)
+  endif()
+
+  if(BUILD_WITH_CUDA)
+    target_link_libraries(gtsam_points_cuda CUDA::cudart)
+  endif()
+  if(BUILD_WITH_HIP)
+    # hip::host (runtime) only -- NOT hip::device, whose -x hip/--offload-arch in
+    # INTERFACE_COMPILE_OPTIONS would propagate to the host .cpp in this target.
+    target_link_libraries(gtsam_points_cuda hip::host)
+  endif()
   target_link_libraries(gtsam_points_cuda
-    CUDA::cudart
     Boost::boost
     Eigen3::Eigen
     gtsam
@@ -278,10 +391,12 @@ if(BUILD_WITH_CUDA)
     $<TARGET_NAME_IF_EXISTS:TBB::tbb>
     $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
   )
-  set_target_properties(gtsam_points_cuda PROPERTIES
-    VERSION   ${PROJECT_VERSION}
-    SOVERSION 1
-  )
+  if(NOT WIN32 OR NOT BUILD_WITH_HIP)
+    set_target_properties(gtsam_points_cuda PROPERTIES
+      VERSION   ${PROJECT_VERSION}
+      SOVERSION 1
+    )
+  endif()
 
   target_link_libraries(gtsam_points
     gtsam_points_cuda
@@ -348,7 +463,30 @@ if(BUILD_TESTS)
     add_executable(${test_name} ${test_src})
     target_link_libraries(${test_name} gtsam_points gtest_main)
     target_include_directories(${test_name} PRIVATE ${Boost_INCLUDE_DIRS} src/test/include)
-    gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}")
+    if(BUILD_WITH_HIP)
+      # Tests call the GPU API across CUstream_st*; force-include the HIP opaque-
+      # type bridge so those calls match the gtsam_points_cuda definitions.
+      target_compile_definitions(${test_name} PRIVATE USE_HIP)
+      target_compile_options(${test_name} PRIVATE
+        -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip_types.h
+      )
+    endif()
+    if(WIN32 AND BUILD_WITH_HIP)
+      # The mixed MSVC/clang toolchain causes duplicate COMDAT symbols when
+      # clang-compiled gtsam_points.lib and MSVC-compiled gtsam.dll both define
+      # the same STL inline/template symbols. Use /FORCE:MULTIPLE to let lld-link
+      # pick one definition and proceed (all definitions are functionally identical).
+      target_link_options(${test_name} PRIVATE "LINKER:/FORCE:MULTIPLE")
+    endif()
+    if(WIN32 AND BUILD_WITH_HIP)
+      # Defer test discovery to run time: build-time discovery requires the exe
+      # to load successfully, but gtsam.dll and ROCm DLLs are not in the exe dir
+      # until the DLL-copy step runs at ctest time.
+      gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+        DISCOVERY_MODE PRE_TEST)
+    else()
+      gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}")
+    endif()
   endforeach()
 
   if(BUILD_TESTS_PCL)
@@ -379,7 +517,7 @@ install(
 )
 
 list(APPEND GTSAM_POINTS_LIBRARIES gtsam_points)
-if(BUILD_WITH_CUDA)
+if(BUILD_WITH_CUDA OR BUILD_WITH_HIP)
   list(APPEND GTSAM_POINTS_LIBRARIES gtsam_points_cuda)
 endif()
 
diff --git a/README.md b/README.md
index 746f8cf1..82e508bb 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ Tested on Ubuntu 22.04 / 24.04 and CUDA 12.2 / 12.6 / 13.1, and NVIDIA Jetson Or
     GICP with voxel-based data association and multi-distribution-correspondence [[3]](#VGICP1)[[4]](#VGICP2).
 - **IntegratedVGICPFactorGPU**  
     GPU implementation of VGICP [[3]](#VGICP1)[[4]](#VGICP2).  
-    To enable this factor, set ```-DBUILD_WITH_CUDA=ON```.
+    To enable this factor, set ```-DBUILD_WITH_CUDA=ON``` (NVIDIA) or ```-DBUILD_WITH_HIP=ON``` (AMD ROCm).
 - **IntegratedLOAMFactor**  
     Matching cost factor based on the combination of point-to-plane and point-to-edge distances [[5]](#LOAM)[[6]](#LEGO).
 
@@ -131,9 +131,11 @@ cmake .. -DCMAKE_BUILD_TYPE=Release
 #   -DBUILD_TOOLS=OFF \               # Set ON to build tools
 #   -DBUILD_WITH_TBB=OFF \            # Set ON to enable TBB
 #   -DBUILD_WITH_OPENMP=OFF \         # Set ON to enable OpenMP
-#   -DBUILD_WITH_CUDA=OFF \           # Set ON to enable CUDA support
+#   -DBUILD_WITH_CUDA=OFF \           # Set ON to enable CUDA support (NVIDIA)
 #   -DBUILD_WITH_CUDA_MULTIARCH=OFF \ # Set ON to enable multi-arch CUDA support
 #   -DCMAKE_CUDA_ARCHITECTURES=89 \   # If not specified, "native" architecture is used
+#   -DBUILD_WITH_HIP=OFF \            # Set ON to enable ROCm/HIP support (AMD); requires ROCm
+#   -DCMAKE_HIP_ARCHITECTURES=gfx90a \# Target AMD GPU arch (e.g. gfx90a, gfx1100); defaults to gfx90a if unset
 #   -DBUILD_WITH_MARCH_NATIVE=OFF     # Set ON to enable -march=native (recommended to keep it OFF)
 
 make -j$(nproc)
@@ -208,7 +210,7 @@ This library is released under the MIT license.
 - [GTSAM](https://gtsam.org/)
 - [optional] [PCL]([https://www.openmp.org/](https://pointclouds.org/))
 - [optional] [OpenMP](https://www.openmp.org/)
-- [optional] [CUDA](https://developer.nvidia.com/cuda-toolkit)
+- [optional] [CUDA](https://developer.nvidia.com/cuda-toolkit) (NVIDIA GPU support) or [ROCm](https://rocm.docs.amd.com/) (AMD GPU support)
 - [optional] [iridescence](https://github.com/koide3/iridescence)
 
 
diff --git a/cmake/gtsam_points-config.cmake.in b/cmake/gtsam_points-config.cmake.in
index 05486054..72b12e60 100644
--- a/cmake/gtsam_points-config.cmake.in
+++ b/cmake/gtsam_points-config.cmake.in
@@ -12,6 +12,7 @@ include_guard()
 set(GTSAM_POINTS_USE_TBB @GTSAM_POINTS_USE_TBB@)
 set(GTSAM_POINTS_USE_OPENMP @GTSAM_POINTS_USE_OPENMP@)
 set(GTSAM_POINTS_USE_CUDA @GTSAM_POINTS_USE_CUDA@)
+set(GTSAM_POINTS_USE_HIP @GTSAM_POINTS_USE_HIP@)
 
 get_filename_component(gtsam_points_CURRENT_CONFIG_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${gtsam_points_CURRENT_CONFIG_DIR}")
@@ -27,7 +28,9 @@ if(GTSAM_POINTS_USE_TBB)
   find_dependency(TBB REQUIRED)
 endif()
 
-if(GTSAM_POINTS_USE_CUDA)
+if(GTSAM_POINTS_USE_HIP)
+  find_dependency(hip REQUIRED)
+elseif(GTSAM_POINTS_USE_CUDA)
   find_dependency(CUDAToolkit REQUIRED)
 endif()
 
diff --git a/hip_compat/cub/device/device_reduce.cuh b/hip_compat/cub/device/device_reduce.cuh
new file mode 100644
index 00000000..319e87c1
--- /dev/null
+++ b/hip_compat/cub/device/device_reduce.cuh
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: maps the CUB include path to hipCUB. The compat header's
+// `#define cub hipcub` rewrites the cub:: namespace in code; this maps the
+// <cub/...> include path to its <hipcub/...> equivalent. HIP include path
+// (PRIVATE+BEFORE) only; on NVIDIA this directory is absent so real CUB wins.
+#pragma once
+#include <hipcub/device/device_reduce.hpp>
diff --git a/hip_compat/cub/device/device_select.cuh b/hip_compat/cub/device/device_select.cuh
new file mode 100644
index 00000000..66d30497
--- /dev/null
+++ b/hip_compat/cub/device/device_select.cuh
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: maps the CUB include path to hipCUB. The compat header's
+// `#define cub hipcub` rewrites the cub:: namespace in code; this maps the
+// <cub/...> include path to its <hipcub/...> equivalent. HIP include path
+// (PRIVATE+BEFORE) only; on NVIDIA this directory is absent so real CUB wins.
+#pragma once
+#include <hipcub/device/device_select.hpp>
diff --git a/hip_compat/cuda.h b/hip_compat/cuda.h
new file mode 100644
index 00000000..8f6586d9
--- /dev/null
+++ b/hip_compat/cuda.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on
+// NVIDIA this directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/cuda_runtime.h b/hip_compat/cuda_runtime.h
new file mode 100644
index 00000000..8f6586d9
--- /dev/null
+++ b/hip_compat/cuda_runtime.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on
+// NVIDIA this directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/cuda_runtime_api.h b/hip_compat/cuda_runtime_api.h
new file mode 100644
index 00000000..8f6586d9
--- /dev/null
+++ b/hip_compat/cuda_runtime_api.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on
+// NVIDIA this directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/curand.h b/hip_compat/curand.h
new file mode 100644
index 00000000..28b02810
--- /dev/null
+++ b/hip_compat/curand.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: check_error_curand.cu includes <curand.h> only to name the
+// CURAND_STATUS_* enum values for its diagnostic switch (the GPU library never
+// calls curand). Forward to the compat header, which maps those names to the
+// hipRAND HIPRAND_STATUS_* values. HIP include path (PRIVATE+BEFORE) only.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/cusparse.h b/hip_compat/cusparse.h
new file mode 100644
index 00000000..835ed7b8
--- /dev/null
+++ b/hip_compat/cusparse.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: check_error_cusolver.cu includes <cusparse.h> but references
+// no cusparse symbol (its cuSOLVER status switch uses integer literals only),
+// so this just satisfies the include. HIP include path (PRIVATE+BEFORE) only.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/device_atomic_functions.h b/hip_compat/device_atomic_functions.h
new file mode 100644
index 00000000..89ebb0e8
--- /dev/null
+++ b/hip_compat/device_atomic_functions.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header (HIP provides atomicCAS/atomicAdd/atomicMax via the
+// HIP runtime). On the HIP include path (PRIVATE+BEFORE) only; on NVIDIA this
+// directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/include/gtsam_points/cuda/cuda_to_hip.h b/include/gtsam_points/cuda/cuda_to_hip.h
new file mode 100644
index 00000000..d3d5ad2a
--- /dev/null
+++ b/include/gtsam_points/cuda/cuda_to_hip.h
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2021  Kenji Koide (k.koide@aist.go.jp)
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+//
+// ROCm/HIP compatibility shim for the gtsam_points GPU library.
+//
+// This is the only file that knows about HIP. On AMD (USE_HIP) it pulls in the
+// HIP runtime and aliases the CUDA spellings the GPU library actually uses to
+// their HIP equivalents; on NVIDIA it is a plain include of the CUDA runtime.
+// It is force-included into every gtsam_points_cuda translation unit by the
+// build (so source files keep their CUDA spelling and the diff stays small).
+//
+// Authored with the assistance of Claude (Anthropic).
+
+#pragma once
+
+#if defined(USE_HIP)
+
+// libc host string/memory decls must win over HIP's __device__ overloads of
+// memcpy/memset, which become visible once hip_runtime.h is in scope inside a
+// translation unit compiled as HIP (point_cloud_gpu.cu uses host std::memcpy).
+#include <cstring>
+#include <cstdlib>
+
+#include <hip/hip_runtime.h>
+
+// The GPU library uses cudaMallocAsync/cudaFreeAsync (stream-ordered) and the
+// CUDA-12 graph-instantiate / 4-arg add-dependencies arities. CUDA_VERSION is
+// undefined on HIP, which would (a) trip cuda_malloc_async.hpp's
+// `#if (CUDA_VERSION < 11000)` and silently downgrade every async alloc to a
+// synchronous hipMalloc, and (b) leave the cuda_graph.cu arity branch on the
+// wrong side. Pin it into [11000, 13000): hipMallocAsync stays the chosen path
+// and the 4-arg hipGraphAddDependencies / 5-arg hipGraphInstantiate signatures
+// (ROCm 7.x) are selected.
+#ifndef CUDA_VERSION
+#define CUDA_VERSION 12000
+#endif
+
+// Opaque API-type bridge (CUstream_st/CUgraph_st/... -> HIP opaque structs).
+// Kept in a separate header so it is also force-included into the main library,
+// keeping a CUstream_st* signature name-mangling compatible across both libs.
+#include <gtsam_points/cuda/cuda_to_hip_types.h>
+
+// hipCUB provides the cub:: device primitives under the hipcub:: namespace.
+#define cub hipcub
+
+// Error handling / device query
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaGetErrorName hipGetErrorName
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaMemGetInfo hipMemGetInfo
+
+// Memory (stream-ordered async allocation is used pervasively)
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+#define cudaMallocAsync hipMallocAsync
+#define cudaFreeAsync hipFreeAsync
+#define cudaMallocHost hipHostMalloc
+#define cudaFreeHost hipHostFree
+#define cudaHostRegister hipHostRegister
+#define cudaHostUnregister hipHostUnregister
+#define cudaHostRegisterDefault hipHostRegisterDefault
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemcpyHostToHost hipMemcpyHostToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+
+// Streams
+#define cudaStream_t hipStream_t
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaStreamCaptureModeGlobal hipStreamCaptureModeGlobal
+
+// Events (easy_profiler timing)
+#define cudaEvent_t hipEvent_t
+#define cudaEventCreate hipEventCreate
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEventElapsedTime hipEventElapsedTime
+#define cudaEventDestroy hipEventDestroy
+
+// CUDA Graphs (compile parity; the VGICP gate drives streams + hipCUB directly)
+#define cudaGraphCreate hipGraphCreate
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphAddDependencies hipGraphAddDependencies
+#define cudaGraphAddChildGraphNode hipGraphAddChildGraphNode
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+
+// rocThrust exposes its device backend execution policies under thrust::hip
+// (thrust::cuda::par[_nosync] is the CUDA backend and is not valid here).
+// Pull the HIP execution-policy header so thrust::hip is defined, then alias
+// the namespace so the ~12 thrust::cuda::par[_nosync].on(stream) call sites
+// resolve to the HIP backend unchanged. This header is force-included at the
+// top of every TU, so the alias must not depend on the source's own includes.
+// Guard on __HIPCC__: only the .cu sources are compiled -x hip (and use the
+// thrust policy); the host .cpp in the GPU lib are compiled by the CXX compiler
+// (which cannot parse rocThrust/rocPRIM) and never use thrust::cuda::par.
+#if defined(__HIPCC__)
+#include <thrust/system/hip/execution_policy.h>
+namespace thrust {
+namespace cuda = hip;
+}
+#endif
+
+// curand error-name strings (check_error_curand.cu): hipRAND ships the same
+// status names under the HIPRAND_STATUS_* spelling. The GPU library never calls
+// curand at runtime (only this diagnostic switch), so no hipRAND link is needed.
+#include <hiprand/hiprand.h>
+#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS
+#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH
+#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED
+#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED
+#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR
+#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE
+#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE
+#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED
+#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE
+#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE
+#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED
+#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH
+#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR
+
+#else
+
+#include <cuda_runtime.h>
+
+#endif
diff --git a/include/gtsam_points/cuda/cuda_to_hip_types.h b/include/gtsam_points/cuda/cuda_to_hip_types.h
new file mode 100644
index 00000000..76dc4f7f
--- /dev/null
+++ b/include/gtsam_points/cuda/cuda_to_hip_types.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2021  Kenji Koide (k.koide@aist.go.jp)
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+//
+// ROCm/HIP opaque-type bridge for the gtsam_points public API.
+//
+// The public headers use the CUDA driver opaque struct pointers (CUstream_st*,
+// CUgraph_st*, ...) as their stream/graph/event types. On HIP those types are
+// the ihip*/hipGraph* opaque structs. This header (force-included into BOTH the
+// main gtsam_points library and the gtsam_points_cuda GPU library under USE_HIP)
+// aliases the CUDA names to the matching HIP opaque structs so a CUstream_st*
+// signature is identical to hipStream_t in every translation unit, keeping the
+// host callers and the .cu definitions name-mangling compatible.
+//
+// It only forward-declares the HIP opaque structs (a pointer type needs no
+// complete type), so it adds no HIP runtime dependency to host C++ files. The
+// full cuda* runtime symbol aliases live in cuda_to_hip.h (GPU library only).
+//
+// Authored with the assistance of Claude (Anthropic).
+
+#pragma once
+
+#if defined(USE_HIP)
+
+struct ihipStream_t;
+struct ihipEvent_t;
+struct ihipGraph;
+struct hipGraphNode;
+struct hipGraphExec;
+
+#define CUstream_st ihipStream_t
+#define CUevent_st ihipEvent_t
+#define CUgraph_st ihipGraph
+#define CUgraphNode_st hipGraphNode
+#define CUgraphExec_st hipGraphExec
+
+#endif
diff --git a/include/gtsam_points/types/point_cloud_cpu.hpp b/include/gtsam_points/types/point_cloud_cpu.hpp
index cd03ca85..a4ad1f8d 100644
--- a/include/gtsam_points/types/point_cloud_cpu.hpp
+++ b/include/gtsam_points/types/point_cloud_cpu.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <numeric>
 #include <vector>
 #include <random>
 #include <Eigen/Core>
diff --git a/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp b/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp
index f234be91..f861e626 100644
--- a/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp
+++ b/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp
@@ -82,20 +82,20 @@ void GaussianVoxelMapCPU::save_compact(const std::string& path) const {
     return GaussianVoxelData(voxel->first.coord, voxel->second);
   });
 
-  std::ofstream ofs(path);
-  ofs << "compact " << 1 << std::endl;
-  ofs << "resolution " << voxel_resolution() << std::endl;
-  ofs << "lru_count " << lru_counter << std::endl;
-  ofs << "lru_cycle " << lru_clear_cycle << std::endl;
-  ofs << "lru_thresh " << lru_horizon << std::endl;
-  ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << std::endl;
-  ofs << "num_voxels " << serial_voxels.size() << std::endl;
+  std::ofstream ofs(path, std::ios::binary);
+  ofs << "compact " << 1 << "\n";
+  ofs << "resolution " << voxel_resolution() << "\n";
+  ofs << "lru_count " << lru_counter << "\n";
+  ofs << "lru_cycle " << lru_clear_cycle << "\n";
+  ofs << "lru_thresh " << lru_horizon << "\n";
+  ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << "\n";
+  ofs << "num_voxels " << serial_voxels.size() << "\n";
 
   ofs.write(reinterpret_cast<const char*>(serial_voxels.data()), sizeof(GaussianVoxelData) * serial_voxels.size());
 }
 
 GaussianVoxelMapCPU::Ptr GaussianVoxelMapCPU::load(const std::string& path) {
-  std::ifstream ifs(path);
+  std::ifstream ifs(path, std::ios::binary);
   if (!ifs) {
     std::cerr << "error: failed to open " << path << std::endl;
     return nullptr;
diff --git a/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp b/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp
index 8b6c231d..2c3e9950 100644
--- a/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp
+++ b/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp
@@ -6,6 +6,7 @@
 #include <memory>
 #include <fstream>
 #include <iostream>
+#include <numeric>
 #include <unordered_set>
 
 #include <Eigen/Core>
diff --git a/src/gtsam_points/types/gaussian_voxelmap_gpu.cu b/src/gtsam_points/types/gaussian_voxelmap_gpu.cu
index 69e5b978..8244790b 100644
--- a/src/gtsam_points/types/gaussian_voxelmap_gpu.cu
+++ b/src/gtsam_points/types/gaussian_voxelmap_gpu.cu
@@ -321,6 +321,7 @@ void GaussianVoxelMapGPU::save_compact(const std::string& path) const {
   check_error << cudaMemcpyAsync(h_voxel_means.data(), voxel_means, sizeof(Eigen::Vector3f) * voxelmap_info.num_voxels, cudaMemcpyDeviceToHost, 0);
   check_error << cudaMemcpyAsync(h_voxel_covs.data(), voxel_covs, sizeof(Eigen::Matrix3f) * voxelmap_info.num_voxels, cudaMemcpyDeviceToHost, 0);
   check_error << cudaMemcpyAsync(h_voxel_intensities.data(), voxel_intensities, sizeof(float) * voxelmap_info.num_voxels, cudaMemcpyDeviceToHost, 0);
+  check_error << cudaStreamSynchronize(0);
 
   std::vector<GaussianVoxelData> serial_voxels;
   serial_voxels.reserve(voxelmap_info.num_voxels);
@@ -357,20 +358,20 @@ void GaussianVoxelMapGPU::save_compact(const std::string& path) const {
     serial_voxels.emplace_back(h_voxel_coords[i], voxel);
   }
 
-  std::ofstream ofs(path);
-  ofs << "compact " << 1 << std::endl;
-  ofs << "resolution " << voxel_resolution() << std::endl;
-  ofs << "lru_count " << 0 << std::endl;
-  ofs << "lru_cycle " << 1 << std::endl;
-  ofs << "lru_thresh " << 1 << std::endl;
-  ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << std::endl;
-  ofs << "num_voxels " << serial_voxels.size() << std::endl;
+  std::ofstream ofs(path, std::ios::binary);
+  ofs << "compact " << 1 << "\n";
+  ofs << "resolution " << voxel_resolution() << "\n";
+  ofs << "lru_count " << 0 << "\n";
+  ofs << "lru_cycle " << 1 << "\n";
+  ofs << "lru_thresh " << 1 << "\n";
+  ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << "\n";
+  ofs << "num_voxels " << serial_voxels.size() << "\n";
 
   ofs.write(reinterpret_cast<const char*>(serial_voxels.data()), sizeof(GaussianVoxelData) * serial_voxels.size());
 }
 
 GaussianVoxelMapGPU::Ptr GaussianVoxelMapGPU::load(const std::string& path) {
-  std::ifstream ifs(path);
+  std::ifstream ifs(path, std::ios::binary);
   if (!ifs) {
     std::cerr << "error: failed to open " << path << std::endl;
     return nullptr;
@@ -463,6 +464,8 @@ GaussianVoxelMapGPU::Ptr GaussianVoxelMapGPU::load(const std::string& path) {
   check_error << cudaMemcpyAsync(voxelmap->voxel_means, h_voxel_means.data(), sizeof(Eigen::Vector3f) * num_voxels, cudaMemcpyHostToDevice, 0);
   check_error << cudaMemcpyAsync(voxelmap->voxel_covs, h_voxel_covs.data(), sizeof(Eigen::Matrix3f) * num_voxels, cudaMemcpyHostToDevice, 0);
   check_error << cudaMemcpyAsync(voxelmap->voxel_intensities, h_voxel_intensities.data(), sizeof(float) * num_voxels, cudaMemcpyHostToDevice, 0);
+  // Ensure all H2D copies complete before the source host buffers go out of scope.
+  check_error << cudaStreamSynchronize(0);
   return voxelmap;
 }
 
diff --git a/src/gtsam_points/types/point_cloud_cpu.cpp b/src/gtsam_points/types/point_cloud_cpu.cpp
index 30739753..fb5f4952 100644
--- a/src/gtsam_points/types/point_cloud_cpu.cpp
+++ b/src/gtsam_points/types/point_cloud_cpu.cpp
@@ -267,8 +267,9 @@ PointCloudCPU::Ptr PointCloudCPU::load(const std::string& path) {
   boost::filesystem::directory_iterator end;
   const std::regex aux_name_regex("/aux_([^_]+).bin");
   for (; itr != end; itr++) {
+    const std::string itr_path_str = itr->path().generic_string();
     std::smatch matched;
-    if (!std::regex_search(itr->path().string(), matched, aux_name_regex)) {
+    if (!std::regex_search(itr_path_str, matched, aux_name_regex)) {
       continue;
     }
     const std::string name = matched.str(1);