koide3 · jeffdaily · Jun 11, 2026 · Jun 23, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -18,6 +18,7 @@ option(BUILD_WITH_TBB "Build with TBB support" OFF)
 option(BUILD_WITH_OPENMP "Build with OpenMP support" ON)
 option(BUILD_WITH_CUDA "Build with GPU support" OFF)
 option(BUILD_WITH_CUDA_MULTIARCH "Build with CUDA multi-architecture support" OFF)
+option(BUILD_WITH_HIP "Build with ROCm/HIP GPU support (AMD)" OFF)
 option(BUILD_WITH_MARCH_NATIVE "Build with -march=native" OFF)
 option(ENABLE_CPPCHECK "Enable cppcheck" OFF)
 option(ENABLE_COVERAGE "Enable coverage check" OFF)
@@ -112,6 +113,33 @@ if(BUILD_WITH_CUDA)
   set(GTSAM_POINTS_CUDA_VERSION_PATCH ${CUDAToolkit_VERSION_PATCH})
 endif()
 
+# GPU-related (ROCm/HIP for AMD). Mirrors the BUILD_WITH_CUDA arm above: keeps
+# GTSAM_POINTS_USE_CUDA / BUILD_GTSAM_POINTS_GPU defined so all existing GPU
+# guards compile unchanged, and builds the same gtsam_points_cuda source list
+# with the .cu sources marked LANGUAGE HIP (see the GPU library section).
+if(BUILD_WITH_HIP)
+  set(GTSAM_POINTS_USE_CUDA 1)
+  set(GTSAM_POINTS_USE_HIP 1)
+  add_definitions(-DBUILD_GTSAM_POINTS_GPU)
+
+  # Windows/clang needs _USE_MATH_DEFINES for M_PI and NOMINMAX to prevent
+  # the Windows min/max macros from colliding with std::min/max.
+  if(WIN32)
+    add_compile_definitions(_USE_MATH_DEFINES NOMINMAX)
+  endif()
+
+  find_package(hip REQUIRED)
+  # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects
+  # the host GPU and errors on a no-GPU build host.
+  enable_language(HIP)
+  message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
+
+  set(GTSAM_POINTS_CUDA_VERSION 0)
+  set(GTSAM_POINTS_CUDA_VERSION_MAJOR 0)
+  set(GTSAM_POINTS_CUDA_VERSION_MINOR 0)
+  set(GTSAM_POINTS_CUDA_VERSION_PATCH 0)
+endif()
+
 if(ENABLE_CPPCHECK)
   set(CMAKE_CXX_CPPCHECK "cppcheck")
   list(APPEND CMAKE_CXX_CPPCHECK
@@ -150,7 +178,16 @@ configure_file(include/gtsam_points/config.hpp.in include/gtsam_points/config.hp
 ###########
 
 
-add_library(gtsam_points SHARED
+if(WIN32 AND BUILD_WITH_HIP)
+  # On Windows with a static GTSAM (GTSAM_EXPORT = empty, no dllimport), both
+  # gtsam_points and gtsam_points_cuda are built as STATIC to avoid circular DLL
+  # import issues (PointCloudGPU inherits PointCloudCPU across library boundary).
+  # Test executables link all three static libs directly.
+  set(_gtsam_points_type STATIC)
+else()
+  set(_gtsam_points_type SHARED)
+endif()
+add_library(gtsam_points ${_gtsam_points_type}
   # util
   src/gtsam_points/util/parallelism.cpp
   src/gtsam_points/util/bspline.cpp
@@ -227,14 +264,37 @@ target_link_libraries(gtsam_points
   $<TARGET_NAME_IF_EXISTS:TBB::tbb>
   $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
 )
-set_target_properties(gtsam_points PROPERTIES
-  VERSION   ${PROJECT_VERSION}
-  SOVERSION 1
-)
+if(NOT WIN32 OR NOT BUILD_WITH_HIP)
+  set_target_properties(gtsam_points PROPERTIES
+    VERSION   ${PROJECT_VERSION}
+    SOVERSION 1
+  )
+endif()
+
+if(BUILD_WITH_HIP)
+  # The public API uses CUstream_st* etc. The main library's host .cpp call into
+  # the GPU library across that type, so they must see the same HIP opaque-type
+  # bridge (CUstream_st -> ihipStream_t) for name mangling to match. Force-
+  # include the lightweight type bridge (forward decls only, no HIP runtime).
+  target_compile_definitions(gtsam_points PRIVATE USE_HIP)
+  target_compile_options(gtsam_points PRIVATE
+    -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip_types.h
+  )
+endif()
 
 # GPU-related
-if(BUILD_WITH_CUDA)
-  add_library(gtsam_points_cuda SHARED
+if(BUILD_WITH_CUDA OR BUILD_WITH_HIP)
+  # On Windows the GPU lib and the main lib have a mutual symbol dependency
+  # (PointCloudGPU inherits PointCloudCPU across the library boundary).
+  # Build as STATIC on Windows/HIP so the GPU objects are absorbed into
+  # gtsam_points.dll at final link, avoiding the circular DLL import.
+  # On Linux/CUDA the existing SHARED build is preserved.
+  if(WIN32 AND BUILD_WITH_HIP)
+    set(_gtsam_points_cuda_type STATIC)
+  else()
+    set(_gtsam_points_cuda_type SHARED)
+  endif()
+  add_library(gtsam_points_cuda ${_gtsam_points_cuda_type}
     # cuda-related
     src/gtsam_points/cuda/check_error.cu
     src/gtsam_points/cuda/check_error_cusolver.cu
@@ -269,19 +329,74 @@ if(BUILD_WITH_CUDA)
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:include>
   )
+
+  if(BUILD_WITH_HIP)
+    # Compile the .cu sources as HIP. The .cpp sources here are host-only
+    # orchestration (streams + async alloc, no device kernels), so they stay
+    # CXX; both languages get the compat header force-included and the
+    # CUDA-toolkit-named forwarding shims on a PRIVATE+BEFORE include path.
+    set(gtsam_points_cuda_hip_sources
+      src/gtsam_points/cuda/check_error.cu
+      src/gtsam_points/cuda/check_error_cusolver.cu
+      src/gtsam_points/cuda/check_error_curand.cu
+      src/gtsam_points/cuda/cuda_memory.cu
+      src/gtsam_points/cuda/cuda_stream.cu
+      src/gtsam_points/cuda/cuda_buffer.cu
+      src/gtsam_points/cuda/cuda_device_sync.cu
+      src/gtsam_points/cuda/cuda_device_prop.cu
+      src/gtsam_points/cuda/cuda_graph.cu
+      src/gtsam_points/cuda/cuda_graph_exec.cu
+      src/gtsam_points/cuda/stream_roundrobin.cu
+      src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu
+      src/gtsam_points/types/point_cloud.cu
+      src/gtsam_points/types/point_cloud_gpu.cu
+      src/gtsam_points/types/gaussian_voxelmap_gpu.cu
+      src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives_inliers.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives_compute.cu
+      src/gtsam_points/factors/integrated_vgicp_derivatives_linearize.cu
+      src/gtsam_points/util/easy_profiler_cuda.cu
+    )
+    set_source_files_properties(${gtsam_points_cuda_hip_sources} PROPERTIES LANGUAGE HIP)
+    set_target_properties(gtsam_points_cuda PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
+
+    # PRIVATE+BEFORE so <cuda.h>/<cuda_runtime.h>/<device_atomic_functions.h>/
+    # <curand.h>/<cusparse.h> resolve to the forwarding shims on the HIP build
+    # only (the dir is absent on the CUDA build, so the real toolkit wins).
+    target_include_directories(gtsam_points_cuda BEFORE PRIVATE
+      ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat
+    )
+    # Force-include the compat header into every TU (CXX and HIP) so sources
+    # keep their CUDA spelling and the CUstream_st/graph type bridge is in scope.
+    target_compile_options(gtsam_points_cuda PRIVATE
+      -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip.h
+    )
+    target_compile_definitions(gtsam_points_cuda PRIVATE USE_HIP)
+  endif()
+
+  if(BUILD_WITH_CUDA)
+    target_link_libraries(gtsam_points_cuda CUDA::cudart)
+  endif()
+  if(BUILD_WITH_HIP)
+    # hip::host (runtime) only -- NOT hip::device, whose -x hip/--offload-arch in
+    # INTERFACE_COMPILE_OPTIONS would propagate to the host .cpp in this target.
+    target_link_libraries(gtsam_points_cuda hip::host)
+  endif()
   target_link_libraries(gtsam_points_cuda
-    CUDA::cudart
     Boost::boost
     Eigen3::Eigen
     gtsam
     gtsam_unstable
     $<TARGET_NAME_IF_EXISTS:TBB::tbb>
     $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
   )
-  set_target_properties(gtsam_points_cuda PROPERTIES
-    VERSION   ${PROJECT_VERSION}
-    SOVERSION 1
-  )
+  if(NOT WIN32 OR NOT BUILD_WITH_HIP)
+    set_target_properties(gtsam_points_cuda PROPERTIES
+      VERSION   ${PROJECT_VERSION}
+      SOVERSION 1
+    )
+  endif()
 
   target_link_libraries(gtsam_points
     gtsam_points_cuda
@@ -348,7 +463,30 @@ if(BUILD_TESTS)
     add_executable(${test_name} ${test_src})
     target_link_libraries(${test_name} gtsam_points gtest_main)
     target_include_directories(${test_name} PRIVATE ${Boost_INCLUDE_DIRS} src/test/include)
-    gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}")
+    if(BUILD_WITH_HIP)
+      # Tests call the GPU API across CUstream_st*; force-include the HIP opaque-
+      # type bridge so those calls match the gtsam_points_cuda definitions.
+      target_compile_definitions(${test_name} PRIVATE USE_HIP)
+      target_compile_options(${test_name} PRIVATE
+        -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip_types.h
+      )
+    endif()
+    if(WIN32 AND BUILD_WITH_HIP)
+      # The mixed MSVC/clang toolchain causes duplicate COMDAT symbols when
+      # clang-compiled gtsam_points.lib and MSVC-compiled gtsam.dll both define
+      # the same STL inline/template symbols. Use /FORCE:MULTIPLE to let lld-link
+      # pick one definition and proceed (all definitions are functionally identical).
+      target_link_options(${test_name} PRIVATE "LINKER:/FORCE:MULTIPLE")
+    endif()
+    if(WIN32 AND BUILD_WITH_HIP)
+      # Defer test discovery to run time: build-time discovery requires the exe
+      # to load successfully, but gtsam.dll and ROCm DLLs are not in the exe dir
+      # until the DLL-copy step runs at ctest time.
+      gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+        DISCOVERY_MODE PRE_TEST)
+    else()
+      gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}")
+    endif()
   endforeach()
 
   if(BUILD_TESTS_PCL)
@@ -379,7 +517,7 @@ install(
 )
 
 list(APPEND GTSAM_POINTS_LIBRARIES gtsam_points)
-if(BUILD_WITH_CUDA)
+if(BUILD_WITH_CUDA OR BUILD_WITH_HIP)
   list(APPEND GTSAM_POINTS_LIBRARIES gtsam_points_cuda)
 endif()
 

diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ Tested on Ubuntu 22.04 / 24.04 and CUDA 12.2 / 12.6 / 13.1, and NVIDIA Jetson Or
     GICP with voxel-based data association and multi-distribution-correspondence [[3]](#VGICP1)[[4]](#VGICP2).
 - **IntegratedVGICPFactorGPU**  
     GPU implementation of VGICP [[3]](#VGICP1)[[4]](#VGICP2).  
-    To enable this factor, set ```-DBUILD_WITH_CUDA=ON```.
+    To enable this factor, set ```-DBUILD_WITH_CUDA=ON``` (NVIDIA) or ```-DBUILD_WITH_HIP=ON``` (AMD ROCm).
 - **IntegratedLOAMFactor**  
     Matching cost factor based on the combination of point-to-plane and point-to-edge distances [[5]](#LOAM)[[6]](#LEGO).
 
@@ -131,9 +131,11 @@ cmake .. -DCMAKE_BUILD_TYPE=Release
 #   -DBUILD_TOOLS=OFF \               # Set ON to build tools
 #   -DBUILD_WITH_TBB=OFF \            # Set ON to enable TBB
 #   -DBUILD_WITH_OPENMP=OFF \         # Set ON to enable OpenMP
-#   -DBUILD_WITH_CUDA=OFF \           # Set ON to enable CUDA support
+#   -DBUILD_WITH_CUDA=OFF \           # Set ON to enable CUDA support (NVIDIA)
 #   -DBUILD_WITH_CUDA_MULTIARCH=OFF \ # Set ON to enable multi-arch CUDA support
 #   -DCMAKE_CUDA_ARCHITECTURES=89 \   # If not specified, "native" architecture is used
+#   -DBUILD_WITH_HIP=OFF \            # Set ON to enable ROCm/HIP support (AMD); requires ROCm
+#   -DCMAKE_HIP_ARCHITECTURES=gfx90a \# Target AMD GPU arch (e.g. gfx90a, gfx1100); defaults to gfx90a if unset
 #   -DBUILD_WITH_MARCH_NATIVE=OFF     # Set ON to enable -march=native (recommended to keep it OFF)
 
 make -j$(nproc)
@@ -208,7 +210,7 @@ This library is released under the MIT license.
 - [GTSAM](https://gtsam.org/)
 - [optional] [PCL]([https://www.openmp.org/](https://pointclouds.org/))
 - [optional] [OpenMP](https://www.openmp.org/)
-- [optional] [CUDA](https://developer.nvidia.com/cuda-toolkit)
+- [optional] [CUDA](https://developer.nvidia.com/cuda-toolkit) (NVIDIA GPU support) or [ROCm](https://rocm.docs.amd.com/) (AMD GPU support)
 - [optional] [iridescence](https://github.com/koide3/iridescence)
 
 

diff --git a/cmake/gtsam_points-config.cmake.in b/cmake/gtsam_points-config.cmake.in
@@ -12,6 +12,7 @@ include_guard()
 set(GTSAM_POINTS_USE_TBB @GTSAM_POINTS_USE_TBB@)
 set(GTSAM_POINTS_USE_OPENMP @GTSAM_POINTS_USE_OPENMP@)
 set(GTSAM_POINTS_USE_CUDA @GTSAM_POINTS_USE_CUDA@)
+set(GTSAM_POINTS_USE_HIP @GTSAM_POINTS_USE_HIP@)
 
 get_filename_component(gtsam_points_CURRENT_CONFIG_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${gtsam_points_CURRENT_CONFIG_DIR}")
@@ -27,7 +28,9 @@ if(GTSAM_POINTS_USE_TBB)
   find_dependency(TBB REQUIRED)
 endif()
 
-if(GTSAM_POINTS_USE_CUDA)
+if(GTSAM_POINTS_USE_HIP)
+  find_dependency(hip REQUIRED)
+elseif(GTSAM_POINTS_USE_CUDA)
   find_dependency(CUDAToolkit REQUIRED)
 endif()
 

diff --git a/hip_compat/cub/device/device_reduce.cuh b/hip_compat/cub/device/device_reduce.cuh
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: maps the CUB include path to hipCUB. The compat header's
+// `#define cub hipcub` rewrites the cub:: namespace in code; this maps the
+// <cub/...> include path to its <hipcub/...> equivalent. HIP include path
+// (PRIVATE+BEFORE) only; on NVIDIA this directory is absent so real CUB wins.
+#pragma once
+#include <hipcub/device/device_reduce.hpp>
diff --git a/hip_compat/cub/device/device_select.cuh b/hip_compat/cub/device/device_select.cuh
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: maps the CUB include path to hipCUB. The compat header's
+// `#define cub hipcub` rewrites the cub:: namespace in code; this maps the
+// <cub/...> include path to its <hipcub/...> equivalent. HIP include path
+// (PRIVATE+BEFORE) only; on NVIDIA this directory is absent so real CUB wins.
+#pragma once
+#include <hipcub/device/device_select.hpp>
diff --git a/hip_compat/cuda.h b/hip_compat/cuda.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on
+// NVIDIA this directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/cuda_runtime.h b/hip_compat/cuda_runtime.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on
+// NVIDIA this directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/cuda_runtime_api.h b/hip_compat/cuda_runtime_api.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on
+// NVIDIA this directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/curand.h b/hip_compat/curand.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: check_error_curand.cu includes <curand.h> only to name the
+// CURAND_STATUS_* enum values for its diagnostic switch (the GPU library never
+// calls curand). Forward to the compat header, which maps those names to the
+// hipRAND HIPRAND_STATUS_* values. HIP include path (PRIVATE+BEFORE) only.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/cusparse.h b/hip_compat/cusparse.h
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: check_error_cusolver.cu includes <cusparse.h> but references
+// no cusparse symbol (its cuSOLVER status switch uses integer literals only),
+// so this just satisfies the include. HIP include path (PRIVATE+BEFORE) only.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>
diff --git a/hip_compat/device_atomic_functions.h b/hip_compat/device_atomic_functions.h
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2026  Advanced Micro Devices, Inc. (Jeff Daily <jeff.daily@amd.com>)
+// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points
+// CUDA-to-HIP compat header (HIP provides atomicCAS/atomicAdd/atomicMax via the
+// HIP runtime). On the HIP include path (PRIVATE+BEFORE) only; on NVIDIA this
+// directory is absent so the real toolkit header is used.
+#pragma once
+#include <gtsam_points/cuda/cuda_to_hip.h>