diff --git a/CMakeLists.txt b/CMakeLists.txt index 8eb52ac7..825f4f78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,7 @@ option(BUILD_WITH_TBB "Build with TBB support" OFF) option(BUILD_WITH_OPENMP "Build with OpenMP support" ON) option(BUILD_WITH_CUDA "Build with GPU support" OFF) option(BUILD_WITH_CUDA_MULTIARCH "Build with CUDA multi-architecture support" OFF) +option(BUILD_WITH_HIP "Build with ROCm/HIP GPU support (AMD)" OFF) option(BUILD_WITH_MARCH_NATIVE "Build with -march=native" OFF) option(ENABLE_CPPCHECK "Enable cppcheck" OFF) option(ENABLE_COVERAGE "Enable coverage check" OFF) @@ -112,6 +113,33 @@ if(BUILD_WITH_CUDA) set(GTSAM_POINTS_CUDA_VERSION_PATCH ${CUDAToolkit_VERSION_PATCH}) endif() +# GPU-related (ROCm/HIP for AMD). Mirrors the BUILD_WITH_CUDA arm above: keeps +# GTSAM_POINTS_USE_CUDA / BUILD_GTSAM_POINTS_GPU defined so all existing GPU +# guards compile unchanged, and builds the same gtsam_points_cuda source list +# with the .cu sources marked LANGUAGE HIP (see the GPU library section). +if(BUILD_WITH_HIP) + set(GTSAM_POINTS_USE_CUDA 1) + set(GTSAM_POINTS_USE_HIP 1) + add_definitions(-DBUILD_GTSAM_POINTS_GPU) + + # Windows/clang needs _USE_MATH_DEFINES for M_PI and NOMINMAX to prevent + # the Windows min/max macros from colliding with std::min/max. + if(WIN32) + add_compile_definitions(_USE_MATH_DEFINES NOMINMAX) + endif() + + find_package(hip REQUIRED) + # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects + # the host GPU and errors on a no-GPU build host. + enable_language(HIP) + message(STATUS "CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}") + + set(GTSAM_POINTS_CUDA_VERSION 0) + set(GTSAM_POINTS_CUDA_VERSION_MAJOR 0) + set(GTSAM_POINTS_CUDA_VERSION_MINOR 0) + set(GTSAM_POINTS_CUDA_VERSION_PATCH 0) +endif() + if(ENABLE_CPPCHECK) set(CMAKE_CXX_CPPCHECK "cppcheck") list(APPEND CMAKE_CXX_CPPCHECK @@ -150,7 +178,16 @@ configure_file(include/gtsam_points/config.hpp.in include/gtsam_points/config.hp ########### -add_library(gtsam_points SHARED +if(WIN32 AND BUILD_WITH_HIP) + # On Windows with a static GTSAM (GTSAM_EXPORT = empty, no dllimport), both + # gtsam_points and gtsam_points_cuda are built as STATIC to avoid circular DLL + # import issues (PointCloudGPU inherits PointCloudCPU across library boundary). + # Test executables link all three static libs directly. + set(_gtsam_points_type STATIC) +else() + set(_gtsam_points_type SHARED) +endif() +add_library(gtsam_points ${_gtsam_points_type} # util src/gtsam_points/util/parallelism.cpp src/gtsam_points/util/bspline.cpp @@ -227,14 +264,37 @@ target_link_libraries(gtsam_points $ $ ) -set_target_properties(gtsam_points PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION 1 -) +if(NOT WIN32 OR NOT BUILD_WITH_HIP) + set_target_properties(gtsam_points PROPERTIES + VERSION ${PROJECT_VERSION} + SOVERSION 1 + ) +endif() + +if(BUILD_WITH_HIP) + # The public API uses CUstream_st* etc. The main library's host .cpp call into + # the GPU library across that type, so they must see the same HIP opaque-type + # bridge (CUstream_st -> ihipStream_t) for name mangling to match. Force- + # include the lightweight type bridge (forward decls only, no HIP runtime). + target_compile_definitions(gtsam_points PRIVATE USE_HIP) + target_compile_options(gtsam_points PRIVATE + -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip_types.h + ) +endif() # GPU-related -if(BUILD_WITH_CUDA) - add_library(gtsam_points_cuda SHARED +if(BUILD_WITH_CUDA OR BUILD_WITH_HIP) + # On Windows the GPU lib and the main lib have a mutual symbol dependency + # (PointCloudGPU inherits PointCloudCPU across the library boundary). + # Build as STATIC on Windows/HIP so the GPU objects are absorbed into + # gtsam_points.dll at final link, avoiding the circular DLL import. + # On Linux/CUDA the existing SHARED build is preserved. + if(WIN32 AND BUILD_WITH_HIP) + set(_gtsam_points_cuda_type STATIC) + else() + set(_gtsam_points_cuda_type SHARED) + endif() + add_library(gtsam_points_cuda ${_gtsam_points_cuda_type} # cuda-related src/gtsam_points/cuda/check_error.cu src/gtsam_points/cuda/check_error_cusolver.cu @@ -269,8 +329,61 @@ if(BUILD_WITH_CUDA) $ $ ) + + if(BUILD_WITH_HIP) + # Compile the .cu sources as HIP. The .cpp sources here are host-only + # orchestration (streams + async alloc, no device kernels), so they stay + # CXX; both languages get the compat header force-included and the + # CUDA-toolkit-named forwarding shims on a PRIVATE+BEFORE include path. + set(gtsam_points_cuda_hip_sources + src/gtsam_points/cuda/check_error.cu + src/gtsam_points/cuda/check_error_cusolver.cu + src/gtsam_points/cuda/check_error_curand.cu + src/gtsam_points/cuda/cuda_memory.cu + src/gtsam_points/cuda/cuda_stream.cu + src/gtsam_points/cuda/cuda_buffer.cu + src/gtsam_points/cuda/cuda_device_sync.cu + src/gtsam_points/cuda/cuda_device_prop.cu + src/gtsam_points/cuda/cuda_graph.cu + src/gtsam_points/cuda/cuda_graph_exec.cu + src/gtsam_points/cuda/stream_roundrobin.cu + src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu + src/gtsam_points/types/point_cloud.cu + src/gtsam_points/types/point_cloud_gpu.cu + src/gtsam_points/types/gaussian_voxelmap_gpu.cu + src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu + src/gtsam_points/factors/integrated_vgicp_derivatives.cu + src/gtsam_points/factors/integrated_vgicp_derivatives_inliers.cu + src/gtsam_points/factors/integrated_vgicp_derivatives_compute.cu + src/gtsam_points/factors/integrated_vgicp_derivatives_linearize.cu + src/gtsam_points/util/easy_profiler_cuda.cu + ) + set_source_files_properties(${gtsam_points_cuda_hip_sources} PROPERTIES LANGUAGE HIP) + set_target_properties(gtsam_points_cuda PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") + + # PRIVATE+BEFORE so /// + # / resolve to the forwarding shims on the HIP build + # only (the dir is absent on the CUDA build, so the real toolkit wins). + target_include_directories(gtsam_points_cuda BEFORE PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat + ) + # Force-include the compat header into every TU (CXX and HIP) so sources + # keep their CUDA spelling and the CUstream_st/graph type bridge is in scope. + target_compile_options(gtsam_points_cuda PRIVATE + -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip.h + ) + target_compile_definitions(gtsam_points_cuda PRIVATE USE_HIP) + endif() + + if(BUILD_WITH_CUDA) + target_link_libraries(gtsam_points_cuda CUDA::cudart) + endif() + if(BUILD_WITH_HIP) + # hip::host (runtime) only -- NOT hip::device, whose -x hip/--offload-arch in + # INTERFACE_COMPILE_OPTIONS would propagate to the host .cpp in this target. + target_link_libraries(gtsam_points_cuda hip::host) + endif() target_link_libraries(gtsam_points_cuda - CUDA::cudart Boost::boost Eigen3::Eigen gtsam @@ -278,10 +391,12 @@ if(BUILD_WITH_CUDA) $ $ ) - set_target_properties(gtsam_points_cuda PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION 1 - ) + if(NOT WIN32 OR NOT BUILD_WITH_HIP) + set_target_properties(gtsam_points_cuda PROPERTIES + VERSION ${PROJECT_VERSION} + SOVERSION 1 + ) + endif() target_link_libraries(gtsam_points gtsam_points_cuda @@ -348,7 +463,30 @@ if(BUILD_TESTS) add_executable(${test_name} ${test_src}) target_link_libraries(${test_name} gtsam_points gtest_main) target_include_directories(${test_name} PRIVATE ${Boost_INCLUDE_DIRS} src/test/include) - gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}") + if(BUILD_WITH_HIP) + # Tests call the GPU API across CUstream_st*; force-include the HIP opaque- + # type bridge so those calls match the gtsam_points_cuda definitions. + target_compile_definitions(${test_name} PRIVATE USE_HIP) + target_compile_options(${test_name} PRIVATE + -include ${CMAKE_CURRENT_SOURCE_DIR}/include/gtsam_points/cuda/cuda_to_hip_types.h + ) + endif() + if(WIN32 AND BUILD_WITH_HIP) + # The mixed MSVC/clang toolchain causes duplicate COMDAT symbols when + # clang-compiled gtsam_points.lib and MSVC-compiled gtsam.dll both define + # the same STL inline/template symbols. Use /FORCE:MULTIPLE to let lld-link + # pick one definition and proceed (all definitions are functionally identical). + target_link_options(${test_name} PRIVATE "LINKER:/FORCE:MULTIPLE") + endif() + if(WIN32 AND BUILD_WITH_HIP) + # Defer test discovery to run time: build-time discovery requires the exe + # to load successfully, but gtsam.dll and ROCm DLLs are not in the exe dir + # until the DLL-copy step runs at ctest time. + gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + DISCOVERY_MODE PRE_TEST) + else() + gtest_discover_tests(${test_name} WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}") + endif() endforeach() if(BUILD_TESTS_PCL) @@ -379,7 +517,7 @@ install( ) list(APPEND GTSAM_POINTS_LIBRARIES gtsam_points) -if(BUILD_WITH_CUDA) +if(BUILD_WITH_CUDA OR BUILD_WITH_HIP) list(APPEND GTSAM_POINTS_LIBRARIES gtsam_points_cuda) endif() diff --git a/README.md b/README.md index 746f8cf1..82e508bb 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Tested on Ubuntu 22.04 / 24.04 and CUDA 12.2 / 12.6 / 13.1, and NVIDIA Jetson Or GICP with voxel-based data association and multi-distribution-correspondence [[3]](#VGICP1)[[4]](#VGICP2). - **IntegratedVGICPFactorGPU** GPU implementation of VGICP [[3]](#VGICP1)[[4]](#VGICP2). - To enable this factor, set ```-DBUILD_WITH_CUDA=ON```. + To enable this factor, set ```-DBUILD_WITH_CUDA=ON``` (NVIDIA) or ```-DBUILD_WITH_HIP=ON``` (AMD ROCm). - **IntegratedLOAMFactor** Matching cost factor based on the combination of point-to-plane and point-to-edge distances [[5]](#LOAM)[[6]](#LEGO). @@ -131,9 +131,11 @@ cmake .. -DCMAKE_BUILD_TYPE=Release # -DBUILD_TOOLS=OFF \ # Set ON to build tools # -DBUILD_WITH_TBB=OFF \ # Set ON to enable TBB # -DBUILD_WITH_OPENMP=OFF \ # Set ON to enable OpenMP -# -DBUILD_WITH_CUDA=OFF \ # Set ON to enable CUDA support +# -DBUILD_WITH_CUDA=OFF \ # Set ON to enable CUDA support (NVIDIA) # -DBUILD_WITH_CUDA_MULTIARCH=OFF \ # Set ON to enable multi-arch CUDA support # -DCMAKE_CUDA_ARCHITECTURES=89 \ # If not specified, "native" architecture is used +# -DBUILD_WITH_HIP=OFF \ # Set ON to enable ROCm/HIP support (AMD); requires ROCm +# -DCMAKE_HIP_ARCHITECTURES=gfx90a \# Target AMD GPU arch (e.g. gfx90a, gfx1100); defaults to gfx90a if unset # -DBUILD_WITH_MARCH_NATIVE=OFF # Set ON to enable -march=native (recommended to keep it OFF) make -j$(nproc) @@ -208,7 +210,7 @@ This library is released under the MIT license. - [GTSAM](https://gtsam.org/) - [optional] [PCL]([https://www.openmp.org/](https://pointclouds.org/)) - [optional] [OpenMP](https://www.openmp.org/) -- [optional] [CUDA](https://developer.nvidia.com/cuda-toolkit) +- [optional] [CUDA](https://developer.nvidia.com/cuda-toolkit) (NVIDIA GPU support) or [ROCm](https://rocm.docs.amd.com/) (AMD GPU support) - [optional] [iridescence](https://github.com/koide3/iridescence) diff --git a/cmake/gtsam_points-config.cmake.in b/cmake/gtsam_points-config.cmake.in index 05486054..72b12e60 100644 --- a/cmake/gtsam_points-config.cmake.in +++ b/cmake/gtsam_points-config.cmake.in @@ -12,6 +12,7 @@ include_guard() set(GTSAM_POINTS_USE_TBB @GTSAM_POINTS_USE_TBB@) set(GTSAM_POINTS_USE_OPENMP @GTSAM_POINTS_USE_OPENMP@) set(GTSAM_POINTS_USE_CUDA @GTSAM_POINTS_USE_CUDA@) +set(GTSAM_POINTS_USE_HIP @GTSAM_POINTS_USE_HIP@) get_filename_component(gtsam_points_CURRENT_CONFIG_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${gtsam_points_CURRENT_CONFIG_DIR}") @@ -27,7 +28,9 @@ if(GTSAM_POINTS_USE_TBB) find_dependency(TBB REQUIRED) endif() -if(GTSAM_POINTS_USE_CUDA) +if(GTSAM_POINTS_USE_HIP) + find_dependency(hip REQUIRED) +elseif(GTSAM_POINTS_USE_CUDA) find_dependency(CUDAToolkit REQUIRED) endif() diff --git a/hip_compat/cub/device/device_reduce.cuh b/hip_compat/cub/device/device_reduce.cuh new file mode 100644 index 00000000..319e87c1 --- /dev/null +++ b/hip_compat/cub/device/device_reduce.cuh @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: maps the CUB include path to hipCUB. The compat header's +// `#define cub hipcub` rewrites the cub:: namespace in code; this maps the +// include path to its equivalent. HIP include path +// (PRIVATE+BEFORE) only; on NVIDIA this directory is absent so real CUB wins. +#pragma once +#include diff --git a/hip_compat/cub/device/device_select.cuh b/hip_compat/cub/device/device_select.cuh new file mode 100644 index 00000000..66d30497 --- /dev/null +++ b/hip_compat/cub/device/device_select.cuh @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: maps the CUB include path to hipCUB. The compat header's +// `#define cub hipcub` rewrites the cub:: namespace in code; this maps the +// include path to its equivalent. HIP include path +// (PRIVATE+BEFORE) only; on NVIDIA this directory is absent so real CUB wins. +#pragma once +#include diff --git a/hip_compat/cuda.h b/hip_compat/cuda.h new file mode 100644 index 00000000..8f6586d9 --- /dev/null +++ b/hip_compat/cuda.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points +// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on +// NVIDIA this directory is absent so the real toolkit header is used. +#pragma once +#include diff --git a/hip_compat/cuda_runtime.h b/hip_compat/cuda_runtime.h new file mode 100644 index 00000000..8f6586d9 --- /dev/null +++ b/hip_compat/cuda_runtime.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points +// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on +// NVIDIA this directory is absent so the real toolkit header is used. +#pragma once +#include diff --git a/hip_compat/cuda_runtime_api.h b/hip_compat/cuda_runtime_api.h new file mode 100644 index 00000000..8f6586d9 --- /dev/null +++ b/hip_compat/cuda_runtime_api.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points +// CUDA-to-HIP compat header. On the HIP include path (PRIVATE+BEFORE) only; on +// NVIDIA this directory is absent so the real toolkit header is used. +#pragma once +#include diff --git a/hip_compat/curand.h b/hip_compat/curand.h new file mode 100644 index 00000000..28b02810 --- /dev/null +++ b/hip_compat/curand.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: check_error_curand.cu includes only to name the +// CURAND_STATUS_* enum values for its diagnostic switch (the GPU library never +// calls curand). Forward to the compat header, which maps those names to the +// hipRAND HIPRAND_STATUS_* values. HIP include path (PRIVATE+BEFORE) only. +#pragma once +#include diff --git a/hip_compat/cusparse.h b/hip_compat/cusparse.h new file mode 100644 index 00000000..835ed7b8 --- /dev/null +++ b/hip_compat/cusparse.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: check_error_cusolver.cu includes but references +// no cusparse symbol (its cuSOLVER status switch uses integer literals only), +// so this just satisfies the include. HIP include path (PRIVATE+BEFORE) only. +#pragma once +#include diff --git a/hip_compat/device_atomic_functions.h b/hip_compat/device_atomic_functions.h new file mode 100644 index 00000000..89ebb0e8 --- /dev/null +++ b/hip_compat/device_atomic_functions.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// HIP build shim: forwards the CUDA-toolkit header name to the gtsam_points +// CUDA-to-HIP compat header (HIP provides atomicCAS/atomicAdd/atomicMax via the +// HIP runtime). On the HIP include path (PRIVATE+BEFORE) only; on NVIDIA this +// directory is absent so the real toolkit header is used. +#pragma once +#include diff --git a/include/gtsam_points/cuda/cuda_to_hip.h b/include/gtsam_points/cuda/cuda_to_hip.h new file mode 100644 index 00000000..d3d5ad2a --- /dev/null +++ b/include/gtsam_points/cuda/cuda_to_hip.h @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2021 Kenji Koide (k.koide@aist.go.jp) +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// +// ROCm/HIP compatibility shim for the gtsam_points GPU library. +// +// This is the only file that knows about HIP. On AMD (USE_HIP) it pulls in the +// HIP runtime and aliases the CUDA spellings the GPU library actually uses to +// their HIP equivalents; on NVIDIA it is a plain include of the CUDA runtime. +// It is force-included into every gtsam_points_cuda translation unit by the +// build (so source files keep their CUDA spelling and the diff stays small). +// +// Authored with the assistance of Claude (Anthropic). + +#pragma once + +#if defined(USE_HIP) + +// libc host string/memory decls must win over HIP's __device__ overloads of +// memcpy/memset, which become visible once hip_runtime.h is in scope inside a +// translation unit compiled as HIP (point_cloud_gpu.cu uses host std::memcpy). +#include +#include + +#include + +// The GPU library uses cudaMallocAsync/cudaFreeAsync (stream-ordered) and the +// CUDA-12 graph-instantiate / 4-arg add-dependencies arities. CUDA_VERSION is +// undefined on HIP, which would (a) trip cuda_malloc_async.hpp's +// `#if (CUDA_VERSION < 11000)` and silently downgrade every async alloc to a +// synchronous hipMalloc, and (b) leave the cuda_graph.cu arity branch on the +// wrong side. Pin it into [11000, 13000): hipMallocAsync stays the chosen path +// and the 4-arg hipGraphAddDependencies / 5-arg hipGraphInstantiate signatures +// (ROCm 7.x) are selected. +#ifndef CUDA_VERSION +#define CUDA_VERSION 12000 +#endif + +// Opaque API-type bridge (CUstream_st/CUgraph_st/... -> HIP opaque structs). +// Kept in a separate header so it is also force-included into the main library, +// keeping a CUstream_st* signature name-mangling compatible across both libs. +#include + +// hipCUB provides the cub:: device primitives under the hipcub:: namespace. +#define cub hipcub + +// Error handling / device query +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaMemGetInfo hipMemGetInfo + +// Memory (stream-ordered async allocation is used pervasively) +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMallocAsync hipMallocAsync +#define cudaFreeAsync hipFreeAsync +#define cudaMallocHost hipHostMalloc +#define cudaFreeHost hipHostFree +#define cudaHostRegister hipHostRegister +#define cudaHostUnregister hipHostUnregister +#define cudaHostRegisterDefault hipHostRegisterDefault +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemcpyHostToHost hipMemcpyHostToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice + +// Streams +#define cudaStream_t hipStream_t +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamBeginCapture hipStreamBeginCapture +#define cudaStreamEndCapture hipStreamEndCapture +#define cudaStreamCaptureModeGlobal hipStreamCaptureModeGlobal + +// Events (easy_profiler timing) +#define cudaEvent_t hipEvent_t +#define cudaEventCreate hipEventCreate +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventDestroy hipEventDestroy + +// CUDA Graphs (compile parity; the VGICP gate drives streams + hipCUB directly) +#define cudaGraphCreate hipGraphCreate +#define cudaGraphDestroy hipGraphDestroy +#define cudaGraphAddDependencies hipGraphAddDependencies +#define cudaGraphAddChildGraphNode hipGraphAddChildGraphNode +#define cudaGraphInstantiate hipGraphInstantiate +#define cudaGraphExecDestroy hipGraphExecDestroy +#define cudaGraphLaunch hipGraphLaunch + +// rocThrust exposes its device backend execution policies under thrust::hip +// (thrust::cuda::par[_nosync] is the CUDA backend and is not valid here). +// Pull the HIP execution-policy header so thrust::hip is defined, then alias +// the namespace so the ~12 thrust::cuda::par[_nosync].on(stream) call sites +// resolve to the HIP backend unchanged. This header is force-included at the +// top of every TU, so the alias must not depend on the source's own includes. +// Guard on __HIPCC__: only the .cu sources are compiled -x hip (and use the +// thrust policy); the host .cpp in the GPU lib are compiled by the CXX compiler +// (which cannot parse rocThrust/rocPRIM) and never use thrust::cuda::par. +#if defined(__HIPCC__) +#include +namespace thrust { +namespace cuda = hip; +} +#endif + +// curand error-name strings (check_error_curand.cu): hipRAND ships the same +// status names under the HIPRAND_STATUS_* spelling. The GPU library never calls +// curand at runtime (only this diagnostic switch), so no hipRAND link is needed. +#include +#define CURAND_STATUS_SUCCESS HIPRAND_STATUS_SUCCESS +#define CURAND_STATUS_VERSION_MISMATCH HIPRAND_STATUS_VERSION_MISMATCH +#define CURAND_STATUS_NOT_INITIALIZED HIPRAND_STATUS_NOT_INITIALIZED +#define CURAND_STATUS_ALLOCATION_FAILED HIPRAND_STATUS_ALLOCATION_FAILED +#define CURAND_STATUS_TYPE_ERROR HIPRAND_STATUS_TYPE_ERROR +#define CURAND_STATUS_OUT_OF_RANGE HIPRAND_STATUS_OUT_OF_RANGE +#define CURAND_STATUS_LENGTH_NOT_MULTIPLE HIPRAND_STATUS_LENGTH_NOT_MULTIPLE +#define CURAND_STATUS_DOUBLE_PRECISION_REQUIRED HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED +#define CURAND_STATUS_LAUNCH_FAILURE HIPRAND_STATUS_LAUNCH_FAILURE +#define CURAND_STATUS_PREEXISTING_FAILURE HIPRAND_STATUS_PREEXISTING_FAILURE +#define CURAND_STATUS_INITIALIZATION_FAILED HIPRAND_STATUS_INITIALIZATION_FAILED +#define CURAND_STATUS_ARCH_MISMATCH HIPRAND_STATUS_ARCH_MISMATCH +#define CURAND_STATUS_INTERNAL_ERROR HIPRAND_STATUS_INTERNAL_ERROR + +#else + +#include + +#endif diff --git a/include/gtsam_points/cuda/cuda_to_hip_types.h b/include/gtsam_points/cuda/cuda_to_hip_types.h new file mode 100644 index 00000000..76dc4f7f --- /dev/null +++ b/include/gtsam_points/cuda/cuda_to_hip_types.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2021 Kenji Koide (k.koide@aist.go.jp) +// Copyright (c) 2026 Advanced Micro Devices, Inc. (Jeff Daily ) +// +// ROCm/HIP opaque-type bridge for the gtsam_points public API. +// +// The public headers use the CUDA driver opaque struct pointers (CUstream_st*, +// CUgraph_st*, ...) as their stream/graph/event types. On HIP those types are +// the ihip*/hipGraph* opaque structs. This header (force-included into BOTH the +// main gtsam_points library and the gtsam_points_cuda GPU library under USE_HIP) +// aliases the CUDA names to the matching HIP opaque structs so a CUstream_st* +// signature is identical to hipStream_t in every translation unit, keeping the +// host callers and the .cu definitions name-mangling compatible. +// +// It only forward-declares the HIP opaque structs (a pointer type needs no +// complete type), so it adds no HIP runtime dependency to host C++ files. The +// full cuda* runtime symbol aliases live in cuda_to_hip.h (GPU library only). +// +// Authored with the assistance of Claude (Anthropic). + +#pragma once + +#if defined(USE_HIP) + +struct ihipStream_t; +struct ihipEvent_t; +struct ihipGraph; +struct hipGraphNode; +struct hipGraphExec; + +#define CUstream_st ihipStream_t +#define CUevent_st ihipEvent_t +#define CUgraph_st ihipGraph +#define CUgraphNode_st hipGraphNode +#define CUgraphExec_st hipGraphExec + +#endif diff --git a/include/gtsam_points/types/point_cloud_cpu.hpp b/include/gtsam_points/types/point_cloud_cpu.hpp index cd03ca85..a4ad1f8d 100644 --- a/include/gtsam_points/types/point_cloud_cpu.hpp +++ b/include/gtsam_points/types/point_cloud_cpu.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include diff --git a/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp b/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp index f234be91..f861e626 100644 --- a/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp +++ b/src/gtsam_points/types/gaussian_voxelmap_cpu.cpp @@ -82,20 +82,20 @@ void GaussianVoxelMapCPU::save_compact(const std::string& path) const { return GaussianVoxelData(voxel->first.coord, voxel->second); }); - std::ofstream ofs(path); - ofs << "compact " << 1 << std::endl; - ofs << "resolution " << voxel_resolution() << std::endl; - ofs << "lru_count " << lru_counter << std::endl; - ofs << "lru_cycle " << lru_clear_cycle << std::endl; - ofs << "lru_thresh " << lru_horizon << std::endl; - ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << std::endl; - ofs << "num_voxels " << serial_voxels.size() << std::endl; + std::ofstream ofs(path, std::ios::binary); + ofs << "compact " << 1 << "\n"; + ofs << "resolution " << voxel_resolution() << "\n"; + ofs << "lru_count " << lru_counter << "\n"; + ofs << "lru_cycle " << lru_clear_cycle << "\n"; + ofs << "lru_thresh " << lru_horizon << "\n"; + ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << "\n"; + ofs << "num_voxels " << serial_voxels.size() << "\n"; ofs.write(reinterpret_cast(serial_voxels.data()), sizeof(GaussianVoxelData) * serial_voxels.size()); } GaussianVoxelMapCPU::Ptr GaussianVoxelMapCPU::load(const std::string& path) { - std::ifstream ifs(path); + std::ifstream ifs(path, std::ios::binary); if (!ifs) { std::cerr << "error: failed to open " << path << std::endl; return nullptr; diff --git a/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp b/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp index 8b6c231d..2c3e9950 100644 --- a/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp +++ b/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/src/gtsam_points/types/gaussian_voxelmap_gpu.cu b/src/gtsam_points/types/gaussian_voxelmap_gpu.cu index 69e5b978..8244790b 100644 --- a/src/gtsam_points/types/gaussian_voxelmap_gpu.cu +++ b/src/gtsam_points/types/gaussian_voxelmap_gpu.cu @@ -321,6 +321,7 @@ void GaussianVoxelMapGPU::save_compact(const std::string& path) const { check_error << cudaMemcpyAsync(h_voxel_means.data(), voxel_means, sizeof(Eigen::Vector3f) * voxelmap_info.num_voxels, cudaMemcpyDeviceToHost, 0); check_error << cudaMemcpyAsync(h_voxel_covs.data(), voxel_covs, sizeof(Eigen::Matrix3f) * voxelmap_info.num_voxels, cudaMemcpyDeviceToHost, 0); check_error << cudaMemcpyAsync(h_voxel_intensities.data(), voxel_intensities, sizeof(float) * voxelmap_info.num_voxels, cudaMemcpyDeviceToHost, 0); + check_error << cudaStreamSynchronize(0); std::vector serial_voxels; serial_voxels.reserve(voxelmap_info.num_voxels); @@ -357,20 +358,20 @@ void GaussianVoxelMapGPU::save_compact(const std::string& path) const { serial_voxels.emplace_back(h_voxel_coords[i], voxel); } - std::ofstream ofs(path); - ofs << "compact " << 1 << std::endl; - ofs << "resolution " << voxel_resolution() << std::endl; - ofs << "lru_count " << 0 << std::endl; - ofs << "lru_cycle " << 1 << std::endl; - ofs << "lru_thresh " << 1 << std::endl; - ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << std::endl; - ofs << "num_voxels " << serial_voxels.size() << std::endl; + std::ofstream ofs(path, std::ios::binary); + ofs << "compact " << 1 << "\n"; + ofs << "resolution " << voxel_resolution() << "\n"; + ofs << "lru_count " << 0 << "\n"; + ofs << "lru_cycle " << 1 << "\n"; + ofs << "lru_thresh " << 1 << "\n"; + ofs << "voxel_bytes " << sizeof(GaussianVoxelData) << "\n"; + ofs << "num_voxels " << serial_voxels.size() << "\n"; ofs.write(reinterpret_cast(serial_voxels.data()), sizeof(GaussianVoxelData) * serial_voxels.size()); } GaussianVoxelMapGPU::Ptr GaussianVoxelMapGPU::load(const std::string& path) { - std::ifstream ifs(path); + std::ifstream ifs(path, std::ios::binary); if (!ifs) { std::cerr << "error: failed to open " << path << std::endl; return nullptr; @@ -463,6 +464,8 @@ GaussianVoxelMapGPU::Ptr GaussianVoxelMapGPU::load(const std::string& path) { check_error << cudaMemcpyAsync(voxelmap->voxel_means, h_voxel_means.data(), sizeof(Eigen::Vector3f) * num_voxels, cudaMemcpyHostToDevice, 0); check_error << cudaMemcpyAsync(voxelmap->voxel_covs, h_voxel_covs.data(), sizeof(Eigen::Matrix3f) * num_voxels, cudaMemcpyHostToDevice, 0); check_error << cudaMemcpyAsync(voxelmap->voxel_intensities, h_voxel_intensities.data(), sizeof(float) * num_voxels, cudaMemcpyHostToDevice, 0); + // Ensure all H2D copies complete before the source host buffers go out of scope. + check_error << cudaStreamSynchronize(0); return voxelmap; } diff --git a/src/gtsam_points/types/point_cloud_cpu.cpp b/src/gtsam_points/types/point_cloud_cpu.cpp index 30739753..fb5f4952 100644 --- a/src/gtsam_points/types/point_cloud_cpu.cpp +++ b/src/gtsam_points/types/point_cloud_cpu.cpp @@ -267,8 +267,9 @@ PointCloudCPU::Ptr PointCloudCPU::load(const std::string& path) { boost::filesystem::directory_iterator end; const std::regex aux_name_regex("/aux_([^_]+).bin"); for (; itr != end; itr++) { + const std::string itr_path_str = itr->path().generic_string(); std::smatch matched; - if (!std::regex_search(itr->path().string(), matched, aux_name_regex)) { + if (!std::regex_search(itr_path_str, matched, aux_name_regex)) { continue; } const std::string name = matched.str(1);