diff --git a/README.md b/README.md index 48899146910..64dd0118903 100644 --- a/README.md +++ b/README.md @@ -191,4 +191,49 @@ script/uninstall_precommit.sh ``` If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the -`git commit` command. +git commit` command. + +## Modification of this fork + +This fork introduces an implementation to improve the performnace on mi100 using some technique adapted on the mi300, especially on vector L1 cache hit rate. During the development, we also explore some parameters combination to improve the overall performance without the code modification. + +### How to build + +```bash +mkdir build +cd build +make example_splitK_gemm_xdl_fp16 +``` + +### Baseline + +```bash +bin/example_splitK_gemm_xdl_fp16 1 2 1 3840 4096 4096 4096 4096 4096 +``` + +You can experiment the splitK algorithm with different kbatch value. For example, splitk_factor = 1 means the splitK algorithm runs with kbatch = 1. + +### Profile + +1. Execution metrics tracking via omniperf +Use the following cmd to load the omniperf first. + +```bash +module load omniperf +module load rocm/5.7.1 +``` + +Then we would try to add the following cmd to the script file called submit_jobs.sh and execute them one by one. + +```bash +omniperf profile -n bin -- ./example_splitK_gemm_xdl_fp16 1 2 1 8 3840 4096 4096 4096 4096 4096 +omniperf analyze -p workloads/bin/mi100/ &> analyze.txt +``` + +2. Hardware metrics tracking via rocprof input file + +View the result from profiling using the following cmd. + +```bash +vim analyze.txt +``` diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp index 84b00fcbd69..86b22a98166 100644 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -957,6 +957,68 @@ struct BlockToCTileMap_3DGrid_KSplit } }; +template +struct BlockToCTileMap_3DGrid_KSplit1 +{ + + __host__ __device__ BlockToCTileMap_3DGrid_KSplit1() = default; + + __host__ __device__ constexpr auto + CalculateGridSize(index_t M, index_t N, index_t k_split) const + { + // Create 3D grid + const auto M0 = math::integer_divide_ceil(M, MPerBlock); + const auto N0 = math::integer_divide_ceil(N, NPerBlock); + + return std::make_tuple(N0, M0, k_split); + } + + template + __device__ constexpr auto CalculateBottomIndex(const TopIdx&) const + { + constexpr index_t GroupNum = 8; + auto block_1d_id = blockIdx.x; + + const auto M0 = math::integer_divide_ceil(block_1d_id, MPerBlock); + const auto N0 = math::integer_divide_ceil(block_1d_id, NPerBlock); + + const auto group_size = math::integer_divide_ceil(M0 * N0, GroupNum); + const auto big_group_num = GroupNum - (group_size * GroupNum - M0 * N0); + auto group_id_x = block_1d_id % GroupNum; + auto group_id_y = block_1d_id / GroupNum; + auto remap_block_1d_id = + group_id_x <= big_group_num + ? group_id_x * group_size + group_id_y + : group_id_x * group_size + big_group_num - group_id_x + group_id_y; + + index_t idx_N0 = remap_block_1d_id % N0; + index_t idx_M0 = remap_block_1d_id / N0; + + constexpr index_t M01_ = 8; + const auto M01_adapt = (idx_M0 < static_cast(M0 - M0 % M01_)) ? M01_ : M0 % M01_; + + index_t idx_M00 = idx_M0 / M01_; + index_t idx_M01 = idx_M0 % M01_; + index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; + + // return make_tuple(blockIdx.z, blockIdx.y, blockIdx.x); + return make_tuple(blockIdx.z, idx_N0_M01_local % M01_adapt + idx_M00 * M01_, idx_N0_M01_local / M01_adapt); + } + + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */, + const CTileDim& /* c_tile_dim */) const + { + return true; // always valid provided that user gets grid size from CalculateGridSize() + } + + template + __host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const + { + return true; + } +}; + enum StreamKReductionStrategy { Atomic = 0, // sk block use atomic to do reduction diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp index 6ee279a3f1c..7acc4b3ba06 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp @@ -655,7 +655,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 // return block_id to C matrix tile idx (m0, n0, k_split) mapping __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap() { - return BlockToCTileMap_3DGrid_KSplit(); + return BlockToCTileMap_3DGrid_KSplit1(); } using CGridDesc_M_N = remove_cvref_t; diff --git a/results/analyse_k1.txt b/results/analyse_k1.txt new file mode 100644 index 00000000000..0c8a328ecd6 --- /dev/null +++ b/results/analyse_k1.txt @@ -0,0 +1,967 @@ + +-------- +Analyze +-------- + + +-------------------------------------------------------------------------------- +0. Top Stat +╒════╤══════════════════════════════════════════╤═════════╤══════════════╤════════════╤══════════════╤═══════╕ +│ │ KernelName │ Count │ Sum(ns) │ Mean(ns) │ Median(ns) │ Pct │ +╞════╪══════════════════════════════════════════╪═════════╪══════════════╪════════════╪══════════════╪═══════╡ +│ 0 │ void kernel_gemm_xdlops_v2r4r2_simplifie │ 56.00 │ 188055819.00 │ 3358139.62 │ 3365505.50 │ 99.98 │ +│ │ d analyse_k1.txt