diff --git a/README.md b/README.md
index 48899146910..64dd0118903 100644
--- a/README.md
+++ b/README.md
@@ -191,4 +191,49 @@ script/uninstall_precommit.sh
 ```
 
 If you need to temporarily disable pre-commit hooks, you can add the `--no-verify` option to the
-`git commit` command.
+`git commit` command.
+
+## Modification of this fork
+
+This fork introduces an implementation that improves performance on the MI100 using techniques adapted from the MI300, focusing in particular on the vector L1 cache hit rate. During development, we also explored parameter combinations that improve overall performance without any code modification.
+
+### How to build 
+
+```bash
+mkdir build
+cd build
+make example_splitK_gemm_xdl_fp16
+```
+
+### Baseline
+
+```bash
+bin/example_splitK_gemm_xdl_fp16 1 2 1 <splitk_factor> 3840 4096 4096 4096 4096 4096
+```
+
+You can experiment with the splitK algorithm using different kbatch values. For example, splitk_factor = 1 means the splitK algorithm runs with kbatch = 1.
+
+### Profile
+
+1. Execution metrics tracking via omniperf
+Use the following commands to load omniperf first.
+
+```bash
+module load omniperf
+module load rocm/5.7.1
+```
+
+Then add the following commands to a script file named submit_jobs.sh and execute them one by one.
+
+```bash
+omniperf profile -n bin -- ./example_splitK_gemm_xdl_fp16 1 2 1 8 3840 4096 4096 4096 4096 4096
+omniperf analyze -p workloads/bin/mi100/ &> analyze.txt
+```
+
+2. Hardware metrics tracking via rocprof input file
+
+View the profiling results using the following command.
+
+```bash
+vim analyze.txt
+```
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index 84b00fcbd69..86b22a98166 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -957,6 +957,68 @@ struct BlockToCTileMap_3DGrid_KSplit
     }
 };
 
+template <index_t MPerBlock, index_t NPerBlock>
+struct BlockToCTileMap_3DGrid_KSplit1
+{
+    // Maps a (N tiles, M tiles, k_split) launch grid to C-tile indices in a grouped, swizzled order.
+    __host__ __device__ BlockToCTileMap_3DGrid_KSplit1() = default;
+
+    // Returns the launch grid as (N0, M0, k_split): gridDim.x = N0, gridDim.y = M0.
+    __host__ __device__ constexpr auto
+    CalculateGridSize(index_t M, index_t N, index_t k_split) const
+    {
+        const auto M0 = math::integer_divide_ceil(M, MPerBlock);
+        const auto N0 = math::integer_divide_ceil(N, NPerBlock);
+        return std::make_tuple(N0, M0, k_split);
+    }
+
+    // Returns (k_split index, M-tile index, N-tile index) for the calling block.
+    template <typename TopIdx>
+    __device__ constexpr auto CalculateBottomIndex(const TopIdx&) const
+    {
+        constexpr index_t GroupNum = 8;
+        constexpr index_t M01_     = 8;
+        // Tile counts are read from the launch grid (see CalculateGridSize), not from the block id.
+        const index_t N0 = static_cast<index_t>(gridDim.x);
+        const index_t M0 = static_cast<index_t>(gridDim.y);
+        // Flatten (blockIdx.y, blockIdx.x) so every M/N tile gets a distinct linear id.
+        const index_t block_1d_id =
+            static_cast<index_t>(blockIdx.y) * N0 + static_cast<index_t>(blockIdx.x);
+        // Regroup ids dealt round-robin into GroupNum groups so each group becomes contiguous.
+        const index_t group_size    = math::integer_divide_ceil(M0 * N0, GroupNum);
+        const index_t big_group_num = GroupNum - (group_size * GroupNum - M0 * N0);
+        const index_t group_id_x    = block_1d_id % GroupNum;
+        const index_t group_id_y    = block_1d_id / GroupNum;
+        const index_t remap_block_1d_id =
+            group_id_x <= big_group_num
+                ? group_id_x * group_size + group_id_y
+                : group_id_x * group_size + big_group_num - group_id_x + group_id_y;
+
+        const index_t idx_N0 = remap_block_1d_id % N0;
+        const index_t idx_M0 = remap_block_1d_id / N0;
+        // The last band of M tiles is shorter than M01_ when M0 is not a multiple of M01_.
+        const index_t M01_adapt        = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;
+        const index_t idx_M00          = idx_M0 / M01_;
+        const index_t idx_M01          = idx_M0 % M01_;
+        const index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;
+        return make_tuple(blockIdx.z,
+                          idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
+                          idx_N0_M01_local / M01_adapt);
+    }
+
+    template <typename CTileIdx, typename CTileDim>
+    __host__ __device__ bool ValidCTileIndex(const CTileIdx&, const CTileDim&) const
+    {
+        return true; // always valid provided that user gets grid size from CalculateGridSize()
+    }
+
+    template <typename CGridDesc_M_N>
+    __host__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const
+    {
+        return true;
+    }
+};
+
 enum StreamKReductionStrategy
 {
     Atomic = 0, // sk block use atomic to do reduction
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
index 6ee279a3f1c..7acc4b3ba06 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -655,7 +655,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     // return block_id to C matrix tile idx (m0, n0, k_split) mapping
     __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap()
     {
-        return BlockToCTileMap_3DGrid_KSplit<MPerBlock, NPerBlock>();
+        return BlockToCTileMap_3DGrid_KSplit1<MPerBlock, NPerBlock>();
     }
 
     using CGridDesc_M_N         = remove_cvref_t<decltype(MakeCGridDescriptor_M_N(1, 1, 1))>;
diff --git a/results/analyse_k1.txt b/results/analyse_k1.txt
new file mode 100644
index 00000000000..0c8a328ecd6
--- /dev/null
+++ b/results/analyse_k1.txt
@@ -0,0 +1,967 @@
+
+--------
+Analyze
+--------
+
+
+--------------------------------------------------------------------------------
+0. Top Stat
+╒════╤══════════════════════════════════════════╤═════════╤══════════════╤════════════╤══════════════╤═══════╕
+│    │ KernelName                               │   Count │      Sum(ns) │   Mean(ns) │   Median(ns) │   Pct │
+╞════╪══════════════════════════════════════════╪═════════╪══════════════╪════════════╪══════════════╪═══════╡
+│  0 │ void kernel_gemm_xdlops_v2r4r2_simplifie │   56.00 │ 188055819.00 │ 3358139.62 │   3365505.50 │ 99.98 │
+│    │ d<GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v │         │              │            │              │       │
+│    │ 2r4r2, true, (InMemoryDataOperationEn... │         │              │            │              │       │
+├────┼──────────────────────────────────────────┼─────────┼──────────────┼────────────┼──────────────┼───────┤
+│  1 │ __amd_rocclr_fillBufferAligned.kd        │    1.00 │     40160.00 │   40160.00 │     40160.00 │  0.02 │
+╘════╧══════════════════════════════════════════╧═════════╧══════════════╧════════════╧══════════════╧═══════╛
+
+
+--------------------------------------------------------------------------------
+1. System Info
+╒══════════════════╤══════════════════════════════════════════════════════════════════════╕
+│                  │ Info                                                                 │
+╞══════════════════╪══════════════════════════════════════════════════════════════════════╡
+│ workload_name    │ bin                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ command          │ ./example_splitK_gemm_xdl_fp16 1 2 1 1 3840 4096 4096 4096 4096 4096 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_name        │ t004-003.hpcfund                                                     │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_cpu         │ AMD EPYC 7V13 64-Core Processor                                      │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_distro      │ Rocky Linux 9.1 (Blue Onyx)                                          │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_kernel      │ 5.14.0-162.18.1.el9_1.x86_64                                         │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_rocmver     │ 5.7.1-98                                                             │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ date             │ Wed May 29 18:47:27 2024 (CDT)                                       │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ gpu_soc          │ gfx908                                                               │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSE            │ 8                                                                    │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numCU            │ 120                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSIMD          │ 4                                                                    │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ waveSize         │ 64                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ maxWavesPerCU    │ 40                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ maxWorkgroupSize │ 1024                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L1               │ 16                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L2               │ 8192                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ sclk             │ 1502                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ mclk             │ 1200                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ cur_sclk         │ 300                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ cur_mclk         │ 1200                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L2Banks          │ 32                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ LDSBanks         │ 32                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ name             │ mi100                                                                │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSQC           │ 48                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ hbmBW            │ 1228.8                                                               │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ ip_blocks        │ SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF                                 │
+╘══════════════════╧══════════════════════════════════════════════════════════════════════╛
+
+
+--------------------------------------------------------------------------------
+2. System Speed-of-Light
+2.1 Speed-of-Light
+╒═════════╤═══════════════════════════╤═════════╤══════════════════╤═══════════╤════════╕
+│ Index   │ Metric                    │ Value   │ Unit             │ Peak      │ PoP    │
+╞═════════╪═══════════════════════════╪═════════╪══════════════════╪═══════════╪════════╡
+│ 2.1.0   │ VALU FLOPs                │         │ Gflops           │ 23070.72  │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.1   │ VALU IOPs                 │         │ Gops             │ 23070.72  │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.2   │ MFMA FLOPs (BF16)         │         │ Gflops           │ 92282.88  │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.3   │ MFMA FLOPs (F16)          │         │ Gflops           │ 184565.76 │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.4   │ MFMA FLOPs (F32)          │         │ Gflops           │ 46141.44  │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.5   │ MFMA FLOPs (F64)          │         │ Gflops           │ 46141.44  │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.6   │ MFMA IOPs (Int8)          │         │ Gops             │ 184565.76 │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.7   │ Active CUs                │ 91.0    │ Cus              │ 120.0     │ 75.83  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.8   │ SALU Util                 │ 0.6     │ Pct              │ 100.0     │ 0.6    │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.9   │ VALU Util                 │ 9.35    │ Pct              │ 100.0     │ 9.35   │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.10  │ MFMA Util                 │         │ Pct              │ 100.0     │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.11  │ VALU Active Threads/Wave  │ 64.0    │ Threads          │ 64.0      │ 100.01 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.12  │ IPC - Issue               │ 0.55    │ Instr/cycle      │ 5.0       │ 11.06  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.13  │ LDS BW                    │ 3562.38 │ Gb/sec           │ 23070.72  │ 15.44  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.14  │ LDS Bank Conflict         │ 0.17    │ Conflicts/access │ 32.0      │ 0.52   │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.15  │ Instr Cache Hit Rate      │ 100.0   │ Pct              │ 100.0     │ 100.0  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.16  │ Instr Cache BW            │ 299.24  │ Gb/s             │ 4614.14   │ 6.49   │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.17  │ Scalar L1D Cache Hit Rate │ 99.49   │ Pct              │ 100.0     │ 99.49  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.18  │ Scalar L1D Cache BW       │ 12.74   │ Gb/s             │ 4614.14   │ 0.28   │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.19  │ Vector L1D Cache Hit Rate │ 50.69   │ Pct              │ 100.0     │ 50.69  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.20  │ Vector L1D Cache BW       │ 3572.21 │ Gb/s             │ 11535.36  │ 30.97  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.21  │ L2 Cache Hit Rate         │ 76.63   │ Pct              │ 100.0     │ 76.63  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.22  │ L2-Fabric Read BW         │ 381.74  │ Gb/s             │ 1228.8    │ 31.07  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.23  │ L2-Fabric Write BW        │ 22.94   │ Gb/s             │ 1228.8    │ 1.87   │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.24  │ L2-Fabric Read Latency    │ 276.55  │ Cycles           │           │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.25  │ L2-Fabric Write Latency   │ 403.15  │ Cycles           │           │        │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.26  │ Wave Occupancy            │ 965.75  │ Wavefronts       │ 4800.0    │ 20.12  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.27  │ Instr Fetch BW            │ 149.55  │ Gb/s             │ 2307.07   │ 6.48   │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼────────┤
+│ 2.1.28  │ Instr Fetch Latency       │ 16.13   │ Cycles           │           │        │
+╘═════════╧═══════════════════════════╧═════════╧══════════════════╧═══════════╧════════╛
+
+
+--------------------------------------------------------------------------------
+5. Command Processor (CPC/CPF)
+5.1 Command Processor Fetcher
+╒═════════╤════════════════════╤════════════╤══════════╤════════════╤═══════════════╕
+│ Index   │ Metric             │        Avg │      Min │        Max │ Unit          │
+╞═════════╪════════════════════╪════════════╪══════════╪════════════╪═══════════════╡
+│ 5.1.0   │ GPU Busy Cycles    │ 4551806.28 │ 82002.00 │ 4855140.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.1   │ CPF Busy           │ 4551806.33 │ 82002.00 │ 4855143.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.2   │ CPF Util           │     100.00 │   100.00 │     100.00 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.3   │ CPF Stall          │       0.03 │     0.00 │       0.14 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.4   │ L2Cache Intf Busy  │   13986.93 │  1826.00 │   61255.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.5   │ L2Cache Intf Util  │       0.34 │     0.18 │       2.19 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.6   │ L2Cache Intf Stall │       0.00 │     0.00 │       0.00 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.7   │ UTCL1 Stall        │    1575.09 │     0.00 │   11021.00 │ Cycles/kernel │
+╘═════════╧════════════════════╧════════════╧══════════╧════════════╧═══════════════╛
+5.2 Command Processor Compute
+╒═════════╤════════════════════════╤════════════╤══════════╤════════════╤════════╕
+│ Index   │ Metric                 │        Avg │      Min │        Max │ Unit   │
+╞═════════╪════════════════════════╪════════════╪══════════╪════════════╪════════╡
+│ 5.2.0   │ GPU Busy Cycles        │ 4551806.28 │ 82002.00 │ 4855140.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.1   │ CPC Busy Cycles        │ 4551034.39 │ 82002.00 │ 4811142.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.2   │ CPC Util               │      99.84 │    90.85 │     100.00 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.3   │ CPC Stall Cycles       │   70135.02 │ 14010.00 │   79581.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.4   │ CPC Stall Rate         │       1.81 │     1.05 │      17.08 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.5   │ CPC Packet Decoding    │  383706.72 │ 28234.00 │  524728.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.6   │ SPI Intf Busy Cycles   │ 4164585.02 │ 44002.00 │ 4303250.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.7   │ SPI Intf Util          │      90.86 │    53.66 │      93.68 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.8   │ L2Cache Intf Util      │       0.05 │     0.01 │       1.66 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.9   │ UTCL1 Stall Cycles     │    5622.81 │  3118.00 │    6517.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.10  │ UTCL2 Intf Busy Cycles │    6181.75 │  3533.00 │   12009.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.11  │ UTCL2 Intf Util        │       0.22 │     0.08 │       4.79 │ Pct    │
+╘═════════╧════════════════════════╧════════════╧══════════╧════════════╧════════╛
+
+
+--------------------------------------------------------------------------------
+6. Shader Processor Input (SPI)
+6.1 SPI Stats
+╒═════════╤════════════════════════╤═════════════╤═══════════╤═════════════╤════════════╕
+│ Index   │ Metric                 │         Avg │       Min │         Max │ Unit       │
+╞═════════╪════════════════════════╪═════════════╪═══════════╪═════════════╪════════════╡
+│ 6.1.0   │ GPU Busy               │  4551806.28 │  82002.00 │  4855140.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.1   │ CS Busy                │ 35872816.68 │ 380468.00 │ 37226921.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.2   │ SPI Busy               │  4282313.60 │  48116.00 │  4662910.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.3   │ SQ Busy                │ 35862936.19 │ 370560.00 │ 37207068.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.4   │ Dispatched Workgroups  │     7836.19 │   7680.00 │    15360.00 │ Workgroups │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.5   │ Dispatched Wavefronts  │     8644.61 │   7680.00 │    61440.00 │ Wavefronts │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.6   │ Wave Alloc Failed      │  7824689.79 │  16157.00 │  8364750.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.7   │ Wave Alloc Failed - CS │  3186017.12 │  77597.00 │  3848100.00 │ Cycles     │
+╘═════════╧════════════════════════╧═════════════╧═══════════╧═════════════╧════════════╛
+6.2 SPI Resource Allocation
+╒═════════╤═════════════════════════════╤═════════════╤══════════╤═════════════╤═════════════╕
+│ Index   │ Metric                      │         Avg │      Min │         Max │ Unit        │
+╞═════════╪═════════════════════════════╪═════════════╪══════════╪═════════════╪═════════════╡
+│ 6.2.0   │ Wave request Failed (CS)    │  3186017.12 │ 77597.00 │  3848100.00 │ Cycles      │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.1   │ CS Stall                    │  3552411.86 │     0.00 │  4966225.00 │ Cycles      │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.2   │ CS Stall Rate               │       81.29 │     0.00 │      107.50 │ Pct         │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.3   │ Scratch Stall               │        0.00 │     0.00 │        0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.4   │ Insufficient SIMD Waveslots │        0.00 │     0.00 │        0.00 │ Simd        │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.5   │ Insufficient SIMD VGPRs     │   359193.60 │     0.00 │  1297872.00 │ Simd        │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.6   │ Insufficient SIMD SGPRs     │        0.00 │     0.00 │        0.00 │ Simd        │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.7   │ Insufficient CU LDS         │ 69193410.00 │     0.00 │ 81286185.00 │ Cu          │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.8   │ Insufficient CU Barries     │        0.00 │     0.00 │        0.00 │ Cu          │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.9   │ Insufficient Bulky Resource │        0.00 │     0.00 │        0.00 │ Cu          │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.10  │ Reach CU Threadgroups Limit │        0.00 │     0.00 │        0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.11  │ Reach CU Wave Limit         │        0.00 │     0.00 │        0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.12  │ VGPR Writes                 │        3.99 │     3.62 │        4.00 │ Cycles/wave │
+├─────────┼─────────────────────────────┼─────────────┼──────────┼─────────────┼─────────────┤
+│ 6.2.13  │ SGPR Writes                 │        5.97 │     5.00 │        6.00 │ Cycles/wave │
+╘═════════╧═════════════════════════════╧═════════════╧══════════╧═════════════╧═════════════╛
+
+
+--------------------------------------------------------------------------------
+7. Wavefront
+7.1 Wavefront Launch Stats
+╒═════════╤═════════════════════╤═══════════╤═══════════╤════════════╤════════════╕
+│ Index   │ Metric              │       Avg │       Min │        Max │ Unit       │
+╞═════════╪═════════════════════╪═══════════╪═══════════╪════════════╪════════════╡
+│ 7.1.0   │ Grid Size           │ 551882.11 │ 491520.00 │ 3932160.00 │ Work items │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.1   │ Workgroup Size      │     67.37 │     64.00 │     256.00 │ Work items │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.2   │ Total Wavefronts    │   8644.61 │   7680.00 │   61440.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.3   │ Saved Wavefronts    │      0.00 │      0.00 │       0.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.4   │ Restored Wavefronts │      2.63 │      0.00 │     150.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.5   │ VGPRs               │     47.30 │      8.00 │      48.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.6   │ AGPRs               │     47.30 │      8.00 │      48.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.7   │ SGPRs               │     32.00 │     32.00 │      32.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.8   │ LDS Allocation      │   6539.23 │      0.00 │    6656.00 │ Bytes      │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.9   │ Scratch Allocation  │      0.00 │      0.00 │       0.00 │ Bytes      │
+╘═════════╧═════════════════════╧═══════════╧═══════════╧════════════╧════════════╛
+7.2 Wavefront Runtime Stats
+╒═════════╤════════════════════════╤════════════╤══════════╤════════════╤═════════════════╕
+│ Index   │ Metric                 │        Avg │      Min │        Max │ Unit            │
+╞═════════╪════════════════════════╪════════════╪══════════╪════════════╪═════════════════╡
+│ 7.2.0   │ Kernel Time (Nanosec)  │ 3299929.46 │ 40160.00 │ 3428147.00 │ Ns              │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.1   │ Kernel Time (Cycles)   │ 4551806.28 │ 82002.00 │ 4855140.00 │ Cycle           │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.2   │ Instr/wavefront        │    9315.69 │    46.00 │    9497.00 │ Instr/wavefront │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.3   │ Wave Cycles            │  567639.86 │  1791.92 │  585247.02 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.4   │ Dependency Wait Cycles │  449768.98 │  1450.05 │  464112.53 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.5   │ Issue Wait Cycles      │   68547.84 │   196.11 │   71877.53 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.6   │ Active Cycles          │   49425.95 │   188.02 │   50391.02 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.7   │ Wavefront Occupancy    │     965.75 │   931.42 │    1295.76 │ Wavefronts      │
+╘═════════╧════════════════════════╧════════════╧══════════╧════════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+10. Compute Units - Instruction Mix
+10.1 Instruction Mix
+╒═════════╤═══════════════╤═════════╤═══════╤════════╤════════════════╕
+│ Index   │ Metric        │ Avg     │ Min   │ Max    │ Unit           │
+╞═════════╪═══════════════╪═════════╪═══════╪════════╪════════════════╡
+│ 10.1.0  │ VALU - Vector │         │       │        │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼────────┼────────────────┤
+│ 10.1.1  │ VMEM          │         │       │        │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼────────┼────────────────┤
+│ 10.1.2  │ LDS           │ 1541.85 │ 0.0   │ 1572.0 │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼────────┼────────────────┤
+│ 10.1.3  │ VALU - MFMA   │         │       │        │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼────────┼────────────────┤
+│ 10.1.4  │ SALU          │ 287.03  │ 11.0  │ 360.58 │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼────────┼────────────────┤
+│ 10.1.5  │ SMEM          │ 5.2     │ 4.53  │ 10.13  │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼────────┼────────────────┤
+│ 10.1.6  │ Branch        │ 124.67  │ 6.0   │ 127.0  │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼────────┼────────────────┤
+│ 10.1.7  │ GDS           │ 0.0     │ 0.0   │ 0.0    │ Instr per wave │
+╘═════════╧═══════════════╧═════════╧═══════╧════════╧════════════════╛
+10.2 VALU Arithmetic Instr Mix
+╒═════════╤════════════╤═══════╤═══════╤═══════╤════════════════╕
+│ Index   │ Metric     │ Avg   │ Min   │ Max   │ Unit           │
+╞═════════╪════════════╪═══════╪═══════╪═══════╪════════════════╡
+│ 10.2.0  │ INT-32     │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.1  │ INT-64     │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.2  │ F16-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.3  │ F16-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.4  │ F16-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.5  │ F16-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.6  │ F32-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.7  │ F32-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.8  │ F32-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.9  │ F32-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.10 │ F64-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.11 │ F64-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.12 │ F64-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.13 │ F64-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.14 │ Conversion │       │       │       │ Instr per wave │
+╘═════════╧════════════╧═══════╧═══════╧═══════╧════════════════╛
+10.3 VMEM Instr Mix
+╒═════════╤═══════════════╤═════════╕
+│ Index   │ Type          │   Count │
+╞═════════╪═══════════════╪═════════╡
+│ 10.3.0  │ Buffer Instr  │  757.57 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.1  │ Buffer Read   │  753.61 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.2  │ Buffer Write  │    4.23 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.3  │ Buffer Atomic │    0.00 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.4  │ Flat Instr    │    0.02 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.5  │ Flat Read     │    0.00 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.6  │ Flat Write    │    0.02 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.7  │ Flat Atomic   │    0.00 │
+╘═════════╧═══════════════╧═════════╛
+10.4 MFMA Arithmetic Instr Mix
+╒═════════╤═══════════╤═════════╕
+│ Index   │ Type      │ Count   │
+╞═════════╪═══════════╪═════════╡
+│ 10.4.0  │ MFMA-I8   │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.1  │ MFMA-F16  │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.2  │ MFMA-BF16 │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.3  │ MFMA-F32  │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.4  │ MFMA-F64  │         │
+╘═════════╧═══════════╧═════════╛
+
+
+--------------------------------------------------------------------------------
+11. Compute Units - Compute Pipeline
+11.1 Speed-of-Light
+╒═════════╤═════════════════════╤═════════╕
+│ Index   │ Metric              │ Value   │
+╞═════════╪═════════════════════╪═════════╡
+│ 11.1.0  │ valu_flops_pop      │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.1  │ mfma_flops_bf16_pop │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.2  │ mfma_flops_f16_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.3  │ mfma_flops_f32_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.4  │ mfma_flops_f64_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.5  │ mfma_flops_i8_pop   │         │
+╘═════════╧═════════════════════╧═════════╛
+11.2 Pipeline Stats
+╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric              │ Avg   │ Min   │ Max   │ Unit         │
+╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 11.2.0  │ IPC (Avg)           │ 0.15  │ 0.14  │ 0.52  │ Instr/cycle  │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.1  │ IPC (Issue)         │ 0.55  │ 0.55  │ 0.91  │ Instr/cycle  │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.2  │ SALU Util           │ 0.6   │ 0.39  │ 11.24 │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.3  │ VALU Util           │ 9.35  │ 8.85  │ 13.12 │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.4  │ VALU Active Threads │ 64.0  │ 63.98 │ 64.27 │ Threads      │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.5  │ MFMA Util           │       │       │       │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.6  │ MFMA Instr Cycles   │       │       │       │ Cycles/instr │
+╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
+11.3 Arithmetic Operations
+╒═════════╤═══════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric        │ Avg   │ Min   │ Max   │ Unit         │
+╞═════════╪═══════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 11.3.0  │ FLOPs (Total) │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.1  │ INT8 OPs      │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.2  │ F16 OPs       │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.3  │ BF16 OPs      │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.4  │ F32 OPs       │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.5  │ F64 OPs       │       │       │       │ Ops per wave │
+╘═════════╧═══════════════╧═══════╧═══════╧═══════╧══════════════╛
+
+
+--------------------------------------------------------------------------------
+12. Local Data Share (LDS)
+12.1 Speed-of-Light
+╒═════════╤═════════════════════════╤═════════╤═════════════╕
+│ Index   │ Metric                  │   Value │ Unit        │
+╞═════════╪═════════════════════════╪═════════╪═════════════╡
+│ 12.1.0  │ Utilization             │   19.64 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.1  │ Access Rate             │   12.72 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.2  │ Bandwidth (Pct-of-Peak) │   15.44 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.3  │ Bank Conflict Rate      │    0.52 │ Pct of peak │
+╘═════════╧═════════════════════════╧═════════╧═════════════╛
+12.2 LDS Stats
+╒═════════╤══════════════════════╤════════════╤═══════╤════════════╤══════════════════╕
+│ Index   │ Metric               │        Avg │   Min │        Max │ Unit             │
+╞═════════╪══════════════════════╪════════════╪═══════╪════════════╪══════════════════╡
+│ 12.2.0  │ LDS Instrs           │    1541.85 │  0.00 │    1572.00 │ Instr per wave   │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.1  │ Bandwidth            │ 1554756.73 │  0.00 │ 1585282.00 │ Bytes per wave   │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.2  │ Bank Conficts/Access │       0.17 │  0.17 │       0.17 │ Conflicts/access │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.3  │ Index Accesses       │   14186.65 │  0.00 │   14465.02 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.4  │ Atomic Cycles        │       0.00 │  0.00 │       0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.5  │ Bank Conflict        │    2040.11 │  0.00 │    2080.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.6  │ Addr Conflict        │       0.00 │  0.00 │       0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.7  │ Unaligned Stall      │       0.00 │  0.00 │       0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.8  │ Mem Violations       │       0.00 │  0.00 │       0.00 │ per wave         │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.9  │ LDS Latency          │      68.72 │ 68.65 │      68.78 │ Cycles           │
+╘═════════╧══════════════════════╧════════════╧═══════╧════════════╧══════════════════╛
+
+
+--------------------------------------------------------------------------------
+13. Instruction Cache
+13.1 Speed-of-Light
+╒═════════╤═══════════╤═════════╤═════════════╕
+│ Index   │ Metric    │   Value │ Unit        │
+╞═════════╪═══════════╪═════════╪═════════════╡
+│ 13.1.0  │ Bandwidth │    6.49 │ Pct of peak │
+├─────────┼───────────┼─────────┼─────────────┤
+│ 13.1.1  │ Cache Hit │   99.97 │ Pct of peak │
+╘═════════╧═══════════╧═════════╧═════════════╛
+13.2 Instruction Cache Accesses
+╒═════════╤═════════════════════════╤═════════╤═══════╤═════════╤═════════════════╕
+│ Index   │ L1I Metric              │    Mean │   Min │     Max │ Unit            │
+╞═════════╪═════════════════════════╪═════════╪═══════╪═════════╪═════════════════╡
+│ 13.2.0  │ Req                     │ 1935.37 │  9.00 │ 2011.68 │ Req per wave    │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.1  │ Hits                    │ 1935.32 │  8.89 │ 2010.26 │ Hits per wave   │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.2  │ Misses - Non Duplicated │    0.00 │  0.00 │    0.13 │ Misses per wave │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.3  │ Misses - Duplicated     │    0.06 │  0.00 │    1.32 │ Misses per wave │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.4  │ Cache Hit               │   99.97 │ 98.64 │  100.00 │ Pct             │
+╘═════════╧═════════════════════════╧═════════╧═══════╧═════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+14. Scalar L1 Data Cache
+14.1 Speed-of-Light
+╒═════════╤═══════════╤═════════╤═════════════╕
+│ Index   │ Metric    │   Value │ Unit        │
+╞═════════╪═══════════╪═════════╪═════════════╡
+│ 14.1.0  │ Bandwidth │    0.28 │ Pct of peak │
+├─────────┼───────────┼─────────┼─────────────┤
+│ 14.1.1  │ Cache Hit │   94.20 │ Pct of peak │
+╘═════════╧═══════════╧═════════╧═════════════╛
+14.2 Scalar L1D Cache Accesses
+╒═════════╤═════════════════════════╤════════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric                  │   Mean │   Min │   Max │ Unit         │
+╞═════════╪═════════════════════════╪════════╪═══════╪═══════╪══════════════╡
+│ 14.2.0  │ Req                     │   5.03 │  4.53 │  7.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.1  │ Hits                    │   4.74 │  4.27 │  6.94 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.2  │ Misses - Non Duplicated │   0.02 │  0.00 │  0.03 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.3  │ Misses- Duplicated      │   0.27 │  0.06 │  0.28 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.4  │ Cache Hit               │  94.20 │ 93.92 │ 99.12 │ Pct          │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.5  │ Read Req (Total)        │   5.04 │  4.53 │  7.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.6  │ Atomic Req              │   0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.7  │ Read Req (1 DWord)      │   1.02 │  0.91 │  2.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.8  │ Read Req (2 DWord)      │   2.01 │  1.81 │  3.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.9  │ Read Req (4 DWord)      │   1.99 │  1.00 │  2.29 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.10 │ Read Req (8 DWord)      │   0.02 │  0.00 │  1.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.11 │ Read Req (16 DWord)     │   0.01 │  0.00 │  0.21 │ Req per wave │
+╘═════════╧═════════════════════════╧════════╧═══════╧═══════╧══════════════╛
+14.3 Scalar L1D Cache - L2 Interface
+╒═════════╤════════════╤════════╤═══════╤═══════╤═════════════════╕
+│ Index   │ Metric     │   Mean │   Min │   Max │ Unit            │
+╞═════════╪════════════╪════════╪═══════╪═══════╪═════════════════╡
+│ 14.3.0  │ Read Req   │   0.05 │  0.00 │  1.43 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.1  │ Write Req  │   0.03 │  0.00 │  1.95 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.2  │ Atomic Req │   0.00 │  0.00 │  0.00 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.3  │ Stall      │   0.00 │  0.00 │  0.01 │ Cycles per wave │
+╘═════════╧════════════╧════════╧═══════╧═══════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+15. Texture Addresser and Texture Data (TA/TD)
+15.1 TA
+╒═════════╤════════════════════════╤══════════╤═══════╤══════════╤═════════════════╕
+│ Index   │ Metric                 │      Avg │   Min │      Max │ Unit            │
+╞═════════╪════════════════════════╪══════════╪═══════╪══════════╪═════════════════╡
+│ 15.1.0  │ TA Busy                │    91.27 │ 31.33 │    94.02 │ Pct             │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.1  │ TC2TA Addr Stall       │    73.15 │  5.48 │    75.83 │ Pct             │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.2  │ TC2TA Data Stall       │     0.67 │  0.38 │    15.66 │ Pct             │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.3  │ TD2TA Addr Stall       │     0.00 │  0.00 │     0.00 │ Pct             │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.4  │ Total Instructions     │   757.21 │  1.00 │   772.00 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.5  │ Flat Instr             │     0.02 │  0.00 │     1.00 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.6  │ Flat Read Instr        │     0.00 │  0.00 │     0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.7  │ Flat Write Instr       │     0.02 │  0.00 │     1.00 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.8  │ Flat Atomic Instr      │     0.00 │  0.00 │     0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.9  │ Buffer Instr           │   757.57 │  0.00 │   772.00 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.10 │ Buffer Read Instr      │   753.61 │  0.00 │   780.63 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.11 │ Buffer Write Instr     │     4.23 │  0.00 │    15.28 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.12 │ Buffer Atomic Instr    │     0.00 │  0.00 │     0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.13 │ Buffer Total Cylces    │ 12115.13 │  0.00 │ 12352.00 │ Cycles per wave │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.14 │ Buffer Coalesced Read  │     2.36 │  0.00 │    69.58 │ Cycles per wave │
+├─────────┼────────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 15.1.15 │ Buffer Coalesced Write │     1.86 │  0.00 │    54.75 │ Cycles per wave │
+╘═════════╧════════════════════════╧══════════╧═══════╧══════════╧═════════════════╛
+15.2 TD
+╒═════════╤═══════════════════╤════════╤═══════╤════════╤════════════════╕
+│ Index   │ Metric            │ Avg    │ Min   │ Max    │ Unit           │
+╞═════════╪═══════════════════╪════════╪═══════╪════════╪════════════════╡
+│ 15.2.0  │ TD Busy           │ 95.04  │ 49.69 │ 96.86  │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.1  │ TC2TD Stall       │ 77.76  │ 48.77 │ 78.99  │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.2  │ SPI2TD Stall      │        │       │        │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.3  │ Coalescable Instr │ 0.61   │ 0.0   │ 22.57  │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.4  │ Load Instr        │ 753.92 │ 0.0   │ 791.91 │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.5  │ Store Instr       │ 3.94   │ 1.0   │ 4.0    │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.6  │ Atomic Instr      │ 0.0    │ 0.0   │ 0.0    │ Instr per wave │
+╘═════════╧═══════════════════╧════════╧═══════╧════════╧════════════════╛
+
+
+--------------------------------------------------------------------------------
+16. Vector L1 Data Cache
+16.1 Speed-of-Light
+╒═════════╤═══════════════════╤═════════╤═════════════╕
+│ Index   │ Metric            │   Value │ Unit        │
+╞═════════╪═══════════════════╪═════════╪═════════════╡
+│ 16.1.0  │ Buffer Coalescing │   25.00 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.1  │ Cache Util        │   97.25 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.2  │ Cache BW          │   30.97 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.3  │ Cache Hit         │   50.69 │ Pct of peak │
+╘═════════╧═══════════════════╧═════════╧═════════════╛
+16.2 L1D Cache Stalls
+╒═════════╤════════════════════════╤════════╤═══════╤═══════╤════════╕
+│ Index   │ Metric                 │   Mean │   Min │   Max │ Unit   │
+╞═════════╪════════════════════════╪════════╪═══════╪═══════╪════════╡
+│ 16.2.0  │ Stalled on L2 Data     │  62.20 │ 60.54 │ 72.87 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.1  │ Stalled on L2 Req      │   9.53 │  8.93 │ 29.29 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.2  │ Tag RAM Stall (Read)   │  16.93 │  0.00 │ 17.47 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.3  │ Tag RAM Stall (Write)  │   0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.4  │ Tag RAM Stall (Atomic) │   0.00 │  0.00 │  0.00 │ Pct    │
+╘═════════╧════════════════════════╧════════╧═══════╧═══════╧════════╛
+16.3 L1D Cache Accesses
+╒═════════╤═════════════════════╤═══════════╤═══════════╤═══════════╤════════════════╕
+│ Index   │ Metric              │       Avg │       Min │       Max │ Unit           │
+╞═════════╪═════════════════════╪═══════════╪═══════════╪═══════════╪════════════════╡
+│ 16.3.0  │ Total Req           │  48461.65 │     64.00 │  49408.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.1  │ Read Req            │  48209.44 │      0.00 │  49152.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.2  │ Write Req           │    252.21 │     64.00 │    256.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.3  │ Atomic Req          │      0.00 │      0.00 │      0.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.4  │ Cache BW            │   3572.21 │   1566.60 │   3809.67 │ Gb/s           │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.5  │ Cache Accesses      │  24173.00 │     16.00 │  24793.98 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.6  │ Cache Hits          │  12255.53 │      8.00 │  12630.31 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.7  │ Cache Hit Rate      │     50.69 │     50.00 │     50.96 │ Pct            │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.8  │ Invalidate          │      0.05 │      0.00 │      0.28 │ per wave       │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.9  │ L1-L2 BW            │ 762718.30 │ 762718.30 │ 762718.30 │ Bytes per wave │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.10 │ L1-L2 Read          │  11853.52 │      0.00 │  12134.78 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.11 │ L1-L2 Write         │     63.95 │      8.00 │    117.17 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.12 │ L1-L2 Atomic        │      0.00 │      0.00 │      0.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.13 │ L1 Access Latency   │    628.36 │    612.32 │    764.29 │ Cycles         │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.14 │ L1-L2 Read Latency  │    242.64 │    238.04 │    247.70 │ Cycles         │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.15 │ L1-L2 Write Latency │    231.30 │    114.32 │    382.11 │ Cycles         │
+╘═════════╧═════════════════════╧═══════════╧═══════════╧═══════════╧════════════════╛
+16.4 L1D - L2 Transactions
+╒═════════╤═════════════╤════════╤═════════════╤══════════╤═══════╤══════════╤══════════════╕
+│ Index   │ Metric      │ Xfer   │ Coherency   │      Avg │   Min │      Max │ Unit         │
+╞═════════╪═════════════╪════════╪═════════════╪══════════╪═══════╪══════════╪══════════════╡
+│ 16.4.0  │ NC - Read   │ Read   │ NC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.1  │ UC - Read   │ Read   │ UC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.2  │ CC - Read   │ Read   │ CC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.3  │ RW - Read   │ Read   │ RW          │ 11849.90 │  0.00 │ 12125.38 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.4  │ RW - Write  │ Write  │ RW          │    62.91 │  8.00 │    64.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.5  │ NC - Write  │ Write  │ NC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.6  │ UC - Write  │ Write  │ UC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.7  │ CC - Write  │ Write  │ CC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.8  │ NC - Atomic │ Atomic │ NC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.9  │ UC - Atomic │ Atomic │ UC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.10 │ CC - Atomic │ Atomic │ CC          │     0.00 │  0.00 │     0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼──────────┼───────┼──────────┼──────────────┤
+│ 16.4.11 │ RW - Atomic │ Atomic │ RW          │     0.00 │  0.00 │     0.00 │ Req per wave │
+╘═════════╧═════════════╧════════╧═════════════╧══════════╧═══════╧══════════╧══════════════╛
+16.5 L1D Addr Translation
+╒═════════╤══════════════════════╤══════════╤═══════╤══════════╤═════════════════╕
+│ Index   │ Metric               │     Mean │   Min │      Max │ Units           │
+╞═════════╪══════════════════════╪══════════╪═══════╪══════════╪═════════════════╡
+│ 16.5.0  │ Req                  │ 24167.77 │ 16.00 │ 24640.00 │ Req per wave    │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.1  │ Hit Ratio            │    99.44 │ 98.03 │    99.78 │ Pct             │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.2  │ Hits                 │ 24038.42 │ 15.68 │ 24585.12 │ Hits per wave   │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.3  │ Misses (Translation) │    66.74 │  0.03 │    98.46 │ Misses per wave │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.4  │ Misses (Permission)  │     0.00 │  0.00 │     0.00 │ Misses per wave │
+╘═════════╧══════════════════════╧══════════╧═══════╧══════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+17. L2 Cache
+17.1 Speed-of-Light
+╒═════════╤═════════════╤═════════╤════════╕
+│ Index   │ Metric      │   Value │ Unit   │
+╞═════════╪═════════════╪═════════╪════════╡
+│ 17.1.0  │ L2 Util     │   97.70 │ Pct    │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.1  │ Cache Hit   │   76.63 │ Pct    │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.2  │ L2-EA Rd BW │  381.74 │ Gb/s   │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.3  │ L2-EA Wr BW │   22.94 │ Gb/s   │
+╘═════════╧═════════════╧═════════╧════════╛
+17.2 L2 - Fabric Transactions
+╒═════════╤══════════════════════╤═══════════╤════════╤═══════════╤════════════════╕
+│ Index   │ Metric               │ Avg       │ Min    │ Max       │ Unit           │
+╞═════════╪══════════════════════╪═══════════╪════════╪═══════════╪════════════════╡
+│ 17.2.0  │ Read BW              │ 166610.02 │ 0.38   │ 188666.27 │ Bytes per wave │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.1  │ Write BW             │ 4022.93   │ 512.0  │ 4111.3    │ Bytes per wave │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.2  │ Read (32B)           │ 0.0       │ 0.0    │ 0.0       │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.3  │ Read (Uncached 32B)  │ 0.26      │ 0.01   │ 0.47      │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.4  │ Read (64B)           │ 2603.28   │ 0.01   │ 2947.91   │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.5  │ HBM Read             │ 2593.68   │ 0.01   │ 2848.43   │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.6  │ Write (32B)          │ 0.0       │ 0.0    │ 0.0       │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.7  │ Write (Uncached 32B) │ 0.0       │ 0.0    │ 0.0       │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.8  │ Write (64B)          │ 62.86     │ 8.0    │ 64.24     │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.9  │ HBM Write            │ 62.86     │ 8.0    │ 64.2      │ Req per wave   │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.10 │ Read Latency         │ 276.55    │ 226.52 │ 1022.34   │ Cycles         │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.11 │ Write Latency        │ 403.15    │ 266.19 │ 426.91    │ Cycles         │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.12 │ Atomic Latency       │           │        │           │ Cycles         │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.13 │ Read Stall           │ 0.0       │ 0.0    │ 0.0       │ Pct            │
+├─────────┼──────────────────────┼───────────┼────────┼───────────┼────────────────┤
+│ 17.2.14 │ Write Stall          │ 0.07      │ 0.0    │ 3.99      │ Pct            │
+╘═════════╧══════════════════════╧═══════════╧════════╧═══════════╧════════════════╛
+17.3 L2 Cache Accesses
+╒═════════╤════════════════════╤══════════╤═══════╤══════════╤═════════════════╕
+│ Index   │ Metric             │      Avg │   Min │      Max │ Unit            │
+╞═════════╪════════════════════╪══════════╪═══════╪══════════╪═════════════════╡
+│ 17.3.0  │ Req                │ 11920.42 │  8.01 │ 12200.55 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.1  │ Streaming Req      │     0.00 │  0.00 │     0.00 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.2  │ Read Req           │ 11899.84 │  0.01 │ 12272.77 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.3  │ Write Req          │    65.38 │  8.00 │   136.57 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.4  │ Atomic Req         │     0.00 │  0.00 │     0.00 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.5  │ Probe Req          │     0.00 │  0.00 │     0.03 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.6  │ Hits               │  9297.13 │  0.01 │  9753.46 │ Hits per wave   │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.7  │ Misses             │  2623.28 │  8.01 │  2867.39 │ Misses per wave │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.8  │ Cache Hit          │    76.63 │  0.10 │    80.16 │ Pct             │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.9  │ Writeback          │    65.24 │  8.00 │   135.00 │ per wave        │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.10 │ NC Req             │     0.04 │  0.00 │     0.39 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.11 │ UC Req             │     0.14 │  0.01 │     0.37 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.12 │ CC Req             │     0.00 │  0.00 │     0.00 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.13 │ RW Req             │ 11843.76 │  8.01 │ 12248.32 │ Req per wave    │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.14 │ Writeback (Normal) │    62.92 │  6.80 │    70.86 │ per wave        │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.15 │ Writeback (TC Req) │     0.09 │  0.00 │     2.89 │ per wave        │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.16 │ Evict (Normal)     │  2608.75 │  5.87 │  2918.34 │ per wave        │
+├─────────┼────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 17.3.17 │ Evict (TC Req)     │     0.00 │  0.00 │     0.00 │ per wave        │
+╘═════════╧════════════════════╧══════════╧═══════╧══════════╧═════════════════╛
+17.4 L2 - EA Interface Stalls
+╒═════════╤═════════════════════════════╤═════════════════════╤═══════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric                      │ Type                │ Transaction   │   Avg │   Min │   Max │ Unit         │
+╞═════════╪═════════════════════════════╪═════════════════════╪═══════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 17.4.0  │ Read - Remote Socket Stall  │ Remote Socket Stall │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.1  │ Read - Peer GCD Stall       │ Peer GCD Stall      │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.2  │ Read - HBM Stall            │ HBM Stall           │ Read          │  0.00 │  0.00 │  0.03 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.3  │ Write - Remote Socket Stall │ Remote Socket Stall │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.4  │ Write - Peer GCD Stall      │ Peer GCD Stall      │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.5  │ Write - HBM Stall           │ HBM Stall           │ Write         │  0.02 │  0.00 │  1.05 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.6  │ Write - Credit Starvation   │ Credit Starvation   │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+╘═════════╧═════════════════════════════╧═════════════════════╧═══════════════╧═══════╧═══════╧═══════╧══════════════╛
+
+
+--------------------------------------------------------------------------------
+18. L2 Cache (per Channel)
+18.1 Aggregate Stats (All 32 channels)
+╒═════════╤════════════════════════════╤════════╤═══════════╤════════╤════════╤═════════════════╕
+│ Index   │ Metric                     │ Mean   │ Std Dev   │ Min    │ Max    │ Units           │
+╞═════════╪════════════════════════════╪════════╪═══════════╪════════╪════════╪═════════════════╡
+│ 18.1.0  │ L2 Cache Hit Rate          │ 76.34  │ 10.32     │ 0.1    │ 79.78  │ Pct             │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.1  │ Req                        │ 372.4  │ 50.4      │ 0.25   │ 381.09 │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.2  │ L1 - L2 Read Req           │ 370.43 │ 50.17     │ 0.0    │ 379.09 │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.3  │ L1 - L2 Write Req          │ 1.97   │ 0.23      │ 0.25   │ 2.0    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.4  │ L1 - L2 Atomic Req         │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.5  │ L2 - EA Read Req           │ 80.68  │ 11.58     │ 0.0    │ 89.49  │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.6  │ L2 - EA Write Req          │ 1.96   │ 0.23      │ 0.25   │ 2.01   │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.7  │ L2 - EA Atomic Req         │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.8  │ L2 - EA Read Lat           │ 261.78 │ 33.87     │ 217.18 │ 479.54 │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.9  │ L2 - EA Write Lat          │ 402.1  │ 23.45     │ 256.74 │ 429.72 │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.10 │ L2 - EA Atomic Lat         │        │           │        │        │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.11 │ L2 - EA Read Stall (IO)    │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.12 │ L2 - EA Read Stall (GMI)   │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.13 │ L2 - EA Read Stall (DRAM)  │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.14 │ L2 - EA Write Stall (IO)   │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.15 │ L2 - EA Write Stall (GMI)  │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.16 │ L2 - EA Write Stall (DRAM) │ 0.0    │ 0.0       │ 0.0    │ 0.04   │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼────────┼─────────────────┤
+│ 18.1.17 │ L2 - EA Write Starve       │ 0.0    │ 0.0       │ 0.0    │ 0.0    │ Cycles per wave │
+╘═════════╧════════════════════════════╧════════╧═══════════╧════════╧════════╧═════════════════╛
+18.2 Channel 0-15
+╒═════════════════════════════════════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕
+│                                         │ 18.2.0   │ 18.2.1   │ 18.2.2   │ 18.2.3   │ 18.2.4   │ 18.2.5   │ 18.2.6   │ 18.2.7   │ 18.2.8   │ 18.2.9   │ 18.2.10   │ 18.2.11   │ 18.2.12   │ 18.2.13   │ 18.2.14   │ 18.2.15   │
+╞═════════════════════════════════════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
+│ Channel                                 │ 0.0      │ 1.0      │ 2.0      │ 3.0      │ 4.0      │ 5.0      │ 6.0      │ 7.0      │ 8.0      │ 9.0      │ 10.0      │ 11.0      │ 12.0      │ 13.0      │ 14.0      │ 15.0      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2 Cache Hit Rate (%)                   │ 76.5     │ 76.22    │ 76.43    │ 76.22    │ 76.45    │ 76.22    │ 76.46    │ 76.21    │ 76.47    │ 76.21    │ 76.47     │ 76.19     │ 76.48     │ 76.22     │ 76.47     │ 76.21     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ Requests (Requests)                     │ 372.37   │ 372.42   │ 372.4    │ 372.38   │ 372.37   │ 372.43   │ 372.43   │ 372.38   │ 372.37   │ 372.48   │ 372.4     │ 372.38    │ 372.37    │ 372.46    │ 372.4     │ 372.38    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Read (Requests)                   │ 370.4    │ 370.46   │ 370.43   │ 370.41   │ 370.4    │ 370.46   │ 370.46   │ 370.42   │ 370.4    │ 370.52   │ 370.43    │ 370.42    │ 370.4     │ 370.49    │ 370.43    │ 370.41    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Write (Requests)                  │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97      │ 1.97      │ 1.97      │ 1.97      │ 1.97      │ 1.97      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read (Requests)                   │ 80.16    │ 81.1     │ 80.21    │ 81.22    │ 80.23    │ 81.19    │ 80.09    │ 81.22    │ 80.14    │ 81.17    │ 80.06     │ 81.33     │ 80.21     │ 81.09     │ 80.12     │ 81.25     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write (Requests)                  │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97      │ 1.96      │ 1.97      │ 1.96      │ 1.97      │ 1.96      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Latency (Cycles)             │ 817.86   │ 257.63   │ 254.65   │ 254.07   │ 250.29   │ 253.93   │ 260.88   │ 269.89   │ 253.98   │ 257.46   │ 252.63    │ 252.52    │ 253.76    │ 258.81    │ 260.62    │ 269.93    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Latency (Cycles)            │ 392.26   │ 410.71   │ 389.73   │ 409.58   │ 395.42   │ 411.8    │ 394.54   │ 419.17   │ 389.87   │ 410.54   │ 390.22    │ 406.97    │ 394.58    │ 410.47    │ 394.87    │ 418.87    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic Latency (Cycles)           │          │          │          │          │          │          │          │          │          │          │           │           │           │           │           │           │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - IO (Cycles per)      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - GMI (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - DRAM (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - IO (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - GMI (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - DRAM (Cycles per)   │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - Starve (Cycles per) │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+╘═════════════════════════════════════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛
+18.3 Channel 16-31
+╒═════════════════════════════════════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕
+│                                         │ 18.3.0   │ 18.3.1   │ 18.3.2   │ 18.3.3   │ 18.3.4   │ 18.3.5   │ 18.3.6   │ 18.3.7   │ 18.3.8   │ 18.3.9   │ 18.3.10   │ 18.3.11   │ 18.3.12   │ 18.3.13   │ 18.3.14   │ 18.3.15   │
+╞═════════════════════════════════════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
+│ Channel                                 │ 16.0     │ 17.0     │ 18.0     │ 19.0     │ 20.0     │ 21.0     │ 22.0     │ 23.0     │ 24.0     │ 25.0     │ 26.0      │ 27.0      │ 28.0      │ 29.0      │ 30.0      │ 31.0      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2 Cache Hit Rate (%)                   │ 76.46    │ 76.21    │ 76.45    │ 76.19    │ 76.45    │ 76.23    │ 76.47    │ 76.21    │ 76.46    │ 76.23    │ 76.45     │ 76.19     │ 76.48     │ 76.2      │ 76.48     │ 76.19     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ Requests (Requests)                     │ 372.37   │ 372.43   │ 372.4    │ 372.38   │ 372.37   │ 372.43   │ 372.4    │ 372.38   │ 372.37   │ 372.43   │ 372.4     │ 372.38    │ 372.37    │ 372.43    │ 372.41    │ 372.38    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Read (Requests)                   │ 370.4    │ 370.46   │ 370.43   │ 370.41   │ 370.4    │ 370.46   │ 370.43   │ 370.42   │ 370.4    │ 370.46   │ 370.43    │ 370.42    │ 370.4     │ 370.46    │ 370.44    │ 370.41    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Write (Requests)                  │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97     │ 1.97      │ 1.97      │ 1.97      │ 1.97      │ 1.97      │ 1.97      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read (Requests)                   │ 80.25    │ 81.12    │ 80.16    │ 81.36    │ 80.23    │ 81.11    │ 80.06    │ 81.3     │ 80.17    │ 81.14    │ 80.11     │ 81.33     │ 80.16     │ 81.16     │ 80.03     │ 81.32     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write (Requests)                  │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97     │ 1.96     │ 1.97      │ 1.96      │ 1.97      │ 1.96      │ 1.97      │ 1.96      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Latency (Cycles)             │ 255.49   │ 254.77   │ 252.49   │ 251.98   │ 254.84   │ 258.48   │ 260.58   │ 269.61   │ 250.99   │ 254.54   │ 252.67    │ 252.28    │ 255.05    │ 258.84    │ 260.6     │ 269.93    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Latency (Cycles)            │ 391.56   │ 407.65   │ 387.03   │ 408.66   │ 394.04   │ 411.54   │ 397.58   │ 419.31   │ 391.92   │ 406.16   │ 388.48    │ 405.07    │ 392.73    │ 412.14    │ 394.2     │ 419.76    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic Latency (Cycles)           │          │          │          │          │          │          │          │          │          │          │           │           │           │           │           │           │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - IO (Cycles per)      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - GMI (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - DRAM (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - IO (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - GMI (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - DRAM (Cycles per)   │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - Starve (Cycles per) │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+╘═════════════════════════════════════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛
+
diff --git a/results/analyse_k2.txt b/results/analyse_k2.txt
new file mode 100644
index 00000000000..270e1097c56
--- /dev/null
+++ b/results/analyse_k2.txt
@@ -0,0 +1,967 @@
+
+--------
+Analyze
+--------
+
+
+--------------------------------------------------------------------------------
+0. Top Stat
+╒════╤══════════════════════════════════════════╤═════════╤═════════════╤════════════╤══════════════╤═══════╕
+│    │ KernelName                               │   Count │     Sum(ns) │   Mean(ns) │   Median(ns) │   Pct │
+╞════╪══════════════════════════════════════════╪═════════╪═════════════╪════════════╪══════════════╪═══════╡
+│  0 │ void kernel_gemm_xdlops_v2r4r2_simplifie │   56.00 │ 84819982.00 │ 1514642.54 │   1522404.50 │ 99.87 │
+│    │ d<GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v │         │             │            │              │       │
+│    │ 2r4r2, true, (InMemoryDataOperationEn... │         │             │            │              │       │
+├────┼──────────────────────────────────────────┼─────────┼─────────────┼────────────┼──────────────┼───────┤
+│  1 │ __amd_rocclr_fillBufferAligned.kd        │    3.00 │   107520.00 │   35840.00 │     37600.00 │  0.13 │
+╘════╧══════════════════════════════════════════╧═════════╧═════════════╧════════════╧══════════════╧═══════╛
+
+
+--------------------------------------------------------------------------------
+1. System Info
+╒══════════════════╤══════════════════════════════════════════════════════════════════════╕
+│                  │ Info                                                                 │
+╞══════════════════╪══════════════════════════════════════════════════════════════════════╡
+│ workload_name    │ bin                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ command          │ ./example_splitK_gemm_xdl_fp16 1 2 1 2 3840 4096 4096 4096 4096 4096 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_name        │ t008-004.hpcfund                                                     │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_cpu         │ AMD EPYC 7V13 64-Core Processor                                      │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_distro      │ Rocky Linux 9.1 (Blue Onyx)                                          │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_kernel      │ 5.14.0-162.18.1.el9_1.x86_64                                         │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_rocmver     │ 5.7.1-98                                                             │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ date             │ Fri May 24 01:02:32 2024 (CDT)                                       │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ gpu_soc          │ gfx908                                                               │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSE            │ 8                                                                    │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numCU            │ 120                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSIMD          │ 4                                                                    │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ waveSize         │ 64                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ maxWavesPerCU    │ 40                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ maxWorkgroupSize │ 1024                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L1               │ 16                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L2               │ 8192                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ sclk             │ 1502                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ mclk             │ 1200                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ cur_sclk         │ 300                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ cur_mclk         │ 1200                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L2Banks          │ 32                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ LDSBanks         │ 32                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ name             │ mi100                                                                │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSQC           │ 48                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ hbmBW            │ 1228.8                                                               │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ ip_blocks        │ SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF                                 │
+╘══════════════════╧══════════════════════════════════════════════════════════════════════╛
+
+
+--------------------------------------------------------------------------------
+2. System Speed-of-Light
+2.1 Speed-of-Light
+╒═════════╤═══════════════════════════╤═════════╤══════════════════╤═══════════╤═══════╕
+│ Index   │ Metric                    │ Value   │ Unit             │ Peak      │ PoP   │
+╞═════════╪═══════════════════════════╪═════════╪══════════════════╪═══════════╪═══════╡
+│ 2.1.0   │ VALU FLOPs                │         │ Gflops           │ 23070.72  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.1   │ VALU IOPs                 │         │ Gops             │ 23070.72  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.2   │ MFMA FLOPs (BF16)         │         │ Gflops           │ 92282.88  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.3   │ MFMA FLOPs (F16)          │         │ Gflops           │ 184565.76 │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.4   │ MFMA FLOPs (F32)          │         │ Gflops           │ 46141.44  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.5   │ MFMA FLOPs (F64)          │         │ Gflops           │ 46141.44  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.6   │ MFMA IOPs (Int8)          │         │ Gops             │ 184565.76 │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.7   │ Active CUs                │ 101.0   │ Cus              │ 120.0     │ 84.17 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.8   │ SALU Util                 │ 0.92    │ Pct              │ 100.0     │ 0.92  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.9   │ VALU Util                 │ 10.59   │ Pct              │ 100.0     │ 10.59 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.10  │ MFMA Util                 │         │ Pct              │ 100.0     │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.11  │ VALU Active Threads/Wave  │ 64.0    │ Threads          │ 64.0      │ 100.0 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.12  │ IPC - Issue               │ 0.53    │ Instr/cycle      │ 5.0       │ 10.68 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.13  │ LDS BW                    │ 2957.43 │ Gb/sec           │ 23070.72  │ 12.82 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.14  │ LDS Bank Conflict         │ 0.11    │ Conflicts/access │ 32.0      │ 0.33  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.15  │ Instr Cache Hit Rate      │ 100.0   │ Pct              │ 100.0     │ 100.0 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.16  │ Instr Cache BW            │ 313.44  │ Gb/s             │ 4614.14   │ 6.79  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.17  │ Scalar L1D Cache Hit Rate │ 98.9    │ Pct              │ 100.0     │ 98.9  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.18  │ Scalar L1D Cache BW       │ 41.19   │ Gb/s             │ 4614.14   │ 0.89  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.19  │ Vector L1D Cache Hit Rate │ 37.13   │ Pct              │ 100.0     │ 37.13 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.20  │ Vector L1D Cache BW       │ 2142.88 │ Gb/s             │ 11535.36  │ 18.58 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.21  │ L2 Cache Hit Rate         │ 84.47   │ Pct              │ 100.0     │ 84.47 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.22  │ L2-Fabric Read BW         │ 143.54  │ Gb/s             │ 1228.8    │ 11.68 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.23  │ L2-Fabric Write BW        │ 83.33   │ Gb/s             │ 1228.8    │ 6.78  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.24  │ L2-Fabric Read Latency    │ 292.11  │ Cycles           │           │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.25  │ L2-Fabric Write Latency   │ 232.92  │ Cycles           │           │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.26  │ Wave Occupancy            │ 920.47  │ Wavefronts       │ 4800.0    │ 19.18 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.27  │ Instr Fetch BW            │ 156.53  │ Gb/s             │ 2307.07   │ 6.78  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.28  │ Instr Fetch Latency       │ 16.25   │ Cycles           │           │       │
+╘═════════╧═══════════════════════════╧═════════╧══════════════════╧═══════════╧═══════╛
+
+
+--------------------------------------------------------------------------------
+5. Command Processor (CPC/CPF)
+5.1 Command Processor Fetcher
+╒═════════╤════════════════════╤════════════╤══════════╤════════════╤═══════════════╕
+│ Index   │ Metric             │        Avg │      Min │        Max │ Unit          │
+╞═════════╪════════════════════╪════════════╪══════════╪════════════╪═══════════════╡
+│ 5.1.0   │ GPU Busy Cycles    │ 2011766.97 │ 58520.00 │ 2355734.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.1   │ CPF Busy           │ 2011766.98 │ 58520.00 │ 2355735.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.2   │ CPF Util           │     100.00 │   100.00 │     100.00 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.3   │ CPF Stall          │       0.07 │     0.00 │       0.25 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.4   │ L2Cache Intf Busy  │   14176.10 │  1830.00 │   48109.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.5   │ L2Cache Intf Util  │       0.81 │     0.16 │       3.41 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.6   │ L2Cache Intf Stall │       0.00 │     0.00 │       0.00 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.7   │ UTCL1 Stall        │    1110.39 │     0.00 │    1989.00 │ Cycles/kernel │
+╘═════════╧════════════════════╧════════════╧══════════╧════════════╧═══════════════╛
+5.2 Command Processor Compute
+╒═════════╤════════════════════════╤════════════╤══════════╤════════════╤════════╕
+│ Index   │ Metric                 │        Avg │      Min │        Max │ Unit   │
+╞═════════╪════════════════════════╪════════════╪══════════╪════════════╪════════╡
+│ 5.2.0   │ GPU Busy Cycles        │ 2011766.97 │ 58520.00 │ 2355734.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.1   │ CPC Busy Cycles        │ 2011710.88 │ 58520.00 │ 2352425.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.2   │ CPC Util               │      99.71 │    82.88 │     100.00 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.3   │ CPC Stall Cycles       │   80837.34 │ 10058.00 │   88832.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.4   │ CPC Stall Rate         │       4.70 │     3.11 │      22.75 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.5   │ CPC Packet Decoding    │  482524.90 │  9850.00 │  530421.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.6   │ SPI Intf Busy Cycles   │ 1521175.49 │ 40150.00 │ 1650457.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.7   │ SPI Intf Util          │      74.95 │    53.72 │      78.29 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.8   │ L2Cache Intf Util      │       0.08 │     0.02 │       1.68 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.9   │ UTCL1 Stall Cycles     │    5613.22 │  3152.00 │    6607.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.10  │ UTCL2 Intf Busy Cycles │    5410.98 │   910.00 │    7038.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.11  │ UTCL2 Intf Util        │       0.42 │     0.04 │       5.25 │ Pct    │
+╘═════════╧════════════════════════╧════════════╧══════════╧════════════╧════════╛
+
+
+--------------------------------------------------------------------------------
+6. Shader Processor Input (SPI)
+6.1 SPI Stats
+╒═════════╤════════════════════════╤═════════════╤═══════════╤═════════════╤════════════╕
+│ Index   │ Metric                 │         Avg │       Min │         Max │ Unit       │
+╞═════════╪════════════════════════╪═════════════╪═══════════╪═════════════╪════════════╡
+│ 6.1.0   │ GPU Busy               │  2011766.97 │  58520.00 │  2355734.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.1   │ CS Busy                │ 15946675.29 │ 350967.00 │ 17906387.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.2   │ SPI Busy               │  2000035.46 │  44513.00 │  2565209.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.3   │ SQ Busy                │ 15936725.56 │ 343370.00 │ 17888266.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.4   │ Dispatched Workgroups  │     1695.75 │    960.00 │    15360.00 │ Workgroups │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.5   │ Dispatched Wavefronts  │     6782.98 │   3840.00 │    61440.00 │ Wavefronts │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.6   │ Wave Alloc Failed      │  3033083.92 │  14506.00 │  3220696.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.7   │ Wave Alloc Failed - CS │   107010.58 │  75946.00 │   144793.00 │ Cycles     │
+╘═════════╧════════════════════════╧═════════════╧═══════════╧═════════════╧════════════╛
+6.2 SPI Resource Allocation
+╒═════════╤═════════════════════════════╤══════════════╤══════════╤══════════════╤═════════════╕
+│ Index   │ Metric                      │          Avg │      Min │          Max │ Unit        │
+╞═════════╪═════════════════════════════╪══════════════╪══════════╪══════════════╪═════════════╡
+│ 6.2.0   │ Wave request Failed (CS)    │    107010.58 │ 75946.00 │    144793.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.1   │ CS Stall                    │   2926355.64 │     0.00 │   3110205.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.2   │ CS Stall Rate               │       139.14 │     0.00 │       148.23 │ Pct         │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.3   │ Scratch Stall               │         0.00 │     0.00 │         0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.4   │ Insufficient SIMD Waveslots │         0.00 │     0.00 │         0.00 │ Simd        │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.5   │ Insufficient SIMD VGPRs     │ 175606704.29 │     0.00 │ 186670991.00 │ Simd        │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.6   │ Insufficient SIMD SGPRs     │         0.00 │     0.00 │         0.00 │ Simd        │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.7   │ Insufficient CU LDS         │  43866808.47 │     0.00 │  46622430.00 │ Cu          │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.8   │ Insufficient CU Barries     │         0.00 │     0.00 │         0.00 │ Cu          │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.9   │ Insufficient Bulky Resource │         0.00 │     0.00 │         0.00 │ Cu          │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.10  │ Reach CU Threadgroups Limit │         0.00 │     0.00 │         0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.11  │ Reach CU Wave Limit         │         0.00 │     0.00 │         0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.12  │ VGPR Writes                 │         3.99 │     3.28 │         4.00 │ Cycles/wave │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.13  │ SGPR Writes                 │         5.93 │     4.93 │         6.00 │ Cycles/wave │
+╘═════════╧═════════════════════════════╧══════════════╧══════════╧══════════════╧═════════════╛
+
+
+--------------------------------------------------------------------------------
+7. Wavefront
+7.1 Wavefront Launch Stats
+╒═════════╤═════════════════════╤═══════════╤═══════════╤════════════╤════════════╕
+│ Index   │ Metric              │       Avg │       Min │        Max │ Unit       │
+╞═════════╪═════════════════════╪═══════════╪═══════════╪════════════╪════════════╡
+│ 7.1.0   │ Grid Size           │ 433204.07 │ 245760.00 │ 3932160.00 │ Work items │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.1   │ Workgroup Size      │    256.00 │    256.00 │     256.00 │ Work items │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.2   │ Total Wavefronts    │   6782.98 │   3840.00 │   61440.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.3   │ Saved Wavefronts    │      0.00 │      0.00 │       0.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.4   │ Restored Wavefronts │      0.00 │      0.00 │       0.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.5   │ VGPRs               │    121.90 │      8.00 │     128.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.6   │ AGPRs               │    121.90 │      8.00 │     128.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.7   │ SGPRs               │     32.00 │     32.00 │      32.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.8   │ LDS Allocation      │  23812.34 │      0.00 │   25088.00 │ Bytes      │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.9   │ Scratch Allocation  │      0.00 │      0.00 │       0.00 │ Bytes      │
+╘═════════╧═════════════════════╧═══════════╧═══════════╧════════════╧════════════╛
+7.2 Wavefront Runtime Stats
+╒═════════╤════════════════════════╤════════════╤══════════╤════════════╤═════════════════╕
+│ Index   │ Metric                 │        Avg │      Min │        Max │ Unit            │
+╞═════════╪════════════════════════╪════════════╪══════════╪════════════╪═════════════════╡
+│ 7.2.0   │ Kernel Time (Nanosec)  │ 1439449.19 │ 27360.00 │ 1528162.00 │ Ns              │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.1   │ Kernel Time (Cycles)   │ 2011766.97 │ 58520.00 │ 2355734.00 │ Cycle           │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.2   │ Instr/wavefront        │    7374.51 │    46.00 │    8361.70 │ Instr/wavefront │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.3   │ Wave Cycles            │  465975.67 │  1482.93 │  493997.41 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.4   │ Dependency Wait Cycles │  245960.59 │  1186.16 │  294280.03 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.5   │ Issue Wait Cycles      │  180326.39 │    45.62 │  191937.17 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.6   │ Active Cycles          │   40259.82 │   188.01 │   42492.60 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.7   │ Wavefront Occupancy    │     920.47 │   893.67 │    1531.76 │ Wavefronts      │
+╘═════════╧════════════════════════╧════════════╧══════════╧════════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+10. Compute Units - Instruction Mix
+10.1 Instruction Mix
+╒═════════╤═══════════════╤═════════╤═══════╤═════════╤════════════════╕
+│ Index   │ Metric        │ Avg     │ Min   │ Max     │ Unit           │
+╞═════════╪═══════════════╪═════════╪═══════╪═════════╪════════════════╡
+│ 10.1.0  │ VALU - Vector │         │       │         │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼─────────┼────────────────┤
+│ 10.1.1  │ VMEM          │         │       │         │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼─────────┼────────────────┤
+│ 10.1.2  │ LDS           │ 1227.71 │ 0.0   │ 1299.06 │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼─────────┼────────────────┤
+│ 10.1.3  │ VALU - MFMA   │         │       │         │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼─────────┼────────────────┤
+│ 10.1.4  │ SALU          │ 154.96  │ 11.0  │ 163.0   │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼─────────┼────────────────┤
+│ 10.1.5  │ SMEM          │ 5.09    │ 4.44  │ 7.0     │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼─────────┼────────────────┤
+│ 10.1.6  │ Branch        │ 61.24   │ 6.0   │ 137.19  │ Instr per wave │
+├─────────┼───────────────┼─────────┼───────┼─────────┼────────────────┤
+│ 10.1.7  │ GDS           │ 0.0     │ 0.0   │ 0.0     │ Instr per wave │
+╘═════════╧═══════════════╧═════════╧═══════╧═════════╧════════════════╛
+10.2 VALU Arithmetic Instr Mix
+╒═════════╤════════════╤═══════╤═══════╤═══════╤════════════════╕
+│ Index   │ Metric     │ Avg   │ Min   │ Max   │ Unit           │
+╞═════════╪════════════╪═══════╪═══════╪═══════╪════════════════╡
+│ 10.2.0  │ INT-32     │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.1  │ INT-64     │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.2  │ F16-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.3  │ F16-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.4  │ F16-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.5  │ F16-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.6  │ F32-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.7  │ F32-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.8  │ F32-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.9  │ F32-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.10 │ F64-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.11 │ F64-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.12 │ F64-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.13 │ F64-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.14 │ Conversion │       │       │       │ Instr per wave │
+╘═════════╧════════════╧═══════╧═══════╧═══════╧════════════════╛
+10.3 VMEM Instr Mix
+╒═════════╤═══════════════╤═════════╕
+│ Index   │ Type          │   Count │
+╞═════════╪═══════════════╪═════════╡
+│ 10.3.0  │ Buffer Instr  │  425.41 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.1  │ Buffer Read   │  364.79 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.2  │ Buffer Write  │    0.99 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.3  │ Buffer Atomic │   60.63 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.4  │ Flat Instr    │    0.05 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.5  │ Flat Read     │    0.00 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.6  │ Flat Write    │    0.05 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.7  │ Flat Atomic   │    0.00 │
+╘═════════╧═══════════════╧═════════╛
+10.4 MFMA Arithmetic Instr Mix
+╒═════════╤═══════════╤═════════╕
+│ Index   │ Type      │ Count   │
+╞═════════╪═══════════╪═════════╡
+│ 10.4.0  │ MFMA-I8   │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.1  │ MFMA-F16  │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.2  │ MFMA-BF16 │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.3  │ MFMA-F32  │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.4  │ MFMA-F64  │         │
+╘═════════╧═══════════╧═════════╛
+
+
+--------------------------------------------------------------------------------
+11. Compute Units - Compute Pipeline
+11.1 Speed-of-Light
+╒═════════╤═════════════════════╤═════════╕
+│ Index   │ Metric              │ Value   │
+╞═════════╪═════════════════════╪═════════╡
+│ 11.1.0  │ valu_flops_pop      │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.1  │ mfma_flops_bf16_pop │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.2  │ mfma_flops_f16_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.3  │ mfma_flops_f32_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.4  │ mfma_flops_f64_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.5  │ mfma_flops_i8_pop   │         │
+╘═════════╧═════════════════════╧═════════╛
+11.2 Pipeline Stats
+╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric              │ Avg   │ Min   │ Max   │ Unit         │
+╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 11.2.0  │ IPC (Avg)           │ 0.14  │ 0.11  │ 0.56  │ Instr/cycle  │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.1  │ IPC (Issue)         │ 0.53  │ 0.51  │ 0.91  │ Instr/cycle  │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.2  │ SALU Util           │ 0.92  │ 0.23  │ 15.75 │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.3  │ VALU Util           │ 10.59 │ 9.28  │ 18.38 │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.4  │ VALU Active Threads │ 64.0  │ 63.98 │ 64.0  │ Threads      │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.5  │ MFMA Util           │       │       │       │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.6  │ MFMA Instr Cycles   │       │       │       │ Cycles/instr │
+╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
+11.3 Arithmetic Operations
+╒═════════╤═══════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric        │ Avg   │ Min   │ Max   │ Unit         │
+╞═════════╪═══════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 11.3.0  │ FLOPs (Total) │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.1  │ INT8 OPs      │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.2  │ F16 OPs       │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.3  │ BF16 OPs      │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.4  │ F32 OPs       │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.5  │ F64 OPs       │       │       │       │ Ops per wave │
+╘═════════╧═══════════════╧═══════╧═══════╧═══════╧══════════════╛
+
+
+--------------------------------------------------------------------------------
+12. Local Data Share (LDS)
+12.1 Speed-of-Light
+╒═════════╤═════════════════════════╤═════════╤═════════════╕
+│ Index   │ Metric                  │   Value │ Unit        │
+╞═════════╪═════════════════════════╪═════════╪═════════════╡
+│ 12.1.0  │ Utilization             │   15.25 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.1  │ Access Rate             │    8.53 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.2  │ Bandwidth (Pct-of-Peak) │   12.82 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.3  │ Bank Conflict Rate      │    0.33 │ Pct of peak │
+╘═════════╧═════════════════════════╧═════════╧═════════════╛
+12.2 LDS Stats
+╒═════════╤══════════════════════╤════════════╤═══════╤════════════╤══════════════════╕
+│ Index   │ Metric               │        Avg │   Min │        Max │ Unit             │
+╞═════════╪══════════════════════╪════════════╪═══════╪════════════╪══════════════════╡
+│ 12.2.0  │ LDS Instrs           │    1227.71 │  0.00 │    1299.06 │ Instr per wave   │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.1  │ Bandwidth            │ 1164004.52 │  0.00 │ 1228800.00 │ Bytes per wave   │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.2  │ Bank Conficts/Access │       0.11 │  0.11 │       0.11 │ Conflicts/access │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.3  │ Index Accesses       │   10063.79 │  0.00 │   10624.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.4  │ Atomic Cycles        │       0.00 │  0.00 │       0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.5  │ Bank Conflict        │     970.00 │  0.00 │    1024.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.6  │ Addr Conflict        │       0.00 │  0.00 │       0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.7  │ Unaligned Stall      │       0.00 │  0.00 │       0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.8  │ Mem Violations       │       0.00 │  0.00 │       0.00 │ per wave         │
+├─────────┼──────────────────────┼────────────┼───────┼────────────┼──────────────────┤
+│ 12.2.9  │ LDS Latency          │      78.63 │ 78.59 │      78.67 │ Cycles           │
+╘═════════╧══════════════════════╧════════════╧═══════╧════════════╧══════════════════╛
+
+
+--------------------------------------------------------------------------------
+13. Instruction Cache
+13.1 Speed-of-Light
+╒═════════╤═══════════╤═════════╤═════════════╕
+│ Index   │ Metric    │   Value │ Unit        │
+╞═════════╪═══════════╪═════════╪═════════════╡
+│ 13.1.0  │ Bandwidth │    6.79 │ Pct of peak │
+├─────────┼───────────┼─────────┼─────────────┤
+│ 13.1.1  │ Cache Hit │   99.94 │ Pct of peak │
+╘═════════╧═══════════╧═════════╧═════════════╛
+13.2 Instruction Cache Accesses
+╒═════════╤═════════════════════════╤═════════╤═══════╤═════════╤═════════════════╕
+│ Index   │ L1I Metric              │    Mean │   Min │     Max │ Unit            │
+╞═════════╪═════════════════════════╪═════════╪═══════╪═════════╪═════════════════╡
+│ 13.2.0  │ Req                     │ 1608.46 │  9.00 │ 1835.70 │ Req per wave    │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.1  │ Hits                    │ 1608.25 │  8.89 │ 1831.00 │ Hits per wave   │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.2  │ Misses - Non Duplicated │    0.00 │  0.00 │    0.23 │ Misses per wave │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.3  │ Misses - Duplicated     │    0.22 │  0.00 │    4.48 │ Misses per wave │
+├─────────┼─────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 13.2.4  │ Cache Hit               │   99.94 │ 98.65 │  100.00 │ Pct             │
+╘═════════╧═════════════════════════╧═════════╧═══════╧═════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+14. Scalar L1 Data Cache
+14.1 Speed-of-Light
+╒═════════╤═══════════╤═════════╤═════════════╕
+│ Index   │ Metric    │   Value │ Unit        │
+╞═════════╪═══════════╪═════════╪═════════════╡
+│ 14.1.0  │ Bandwidth │    0.89 │ Pct of peak │
+├─────────┼───────────┼─────────┼─────────────┤
+│ 14.1.1  │ Cache Hit │   85.23 │ Pct of peak │
+╘═════════╧═══════════╧═════════╧═════════════╛
+14.2 Scalar L1D Cache Accesses
+╒═════════╤═════════════════════════╤════════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric                  │   Mean │   Min │   Max │ Unit         │
+╞═════════╪═════════════════════════╪════════╪═══════╪═══════╪══════════════╡
+│ 14.2.0  │ Req                     │   5.09 │  4.44 │  7.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.1  │ Hits                    │   4.35 │  3.76 │  6.94 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.2  │ Misses - Non Duplicated │   0.05 │  0.00 │  0.07 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.3  │ Misses- Duplicated      │   0.69 │  0.06 │  0.99 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.4  │ Cache Hit               │  85.23 │ 78.61 │ 99.13 │ Pct          │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.5  │ Read Req (Total)        │   5.10 │  4.44 │  7.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.6  │ Atomic Req              │   0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.7  │ Read Req (1 DWord)      │   1.05 │  0.89 │  2.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.8  │ Read Req (2 DWord)      │   2.05 │  1.78 │  3.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.9  │ Read Req (4 DWord)      │   1.95 │  1.00 │  2.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.10 │ Read Req (8 DWord)      │   0.05 │  0.00 │  1.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.11 │ Read Req (16 DWord)     │   0.01 │  0.00 │  0.44 │ Req per wave │
+╘═════════╧═════════════════════════╧════════╧═══════╧═══════╧══════════════╛
+14.3 Scalar L1D Cache - L2 Interface
+╒═════════╤════════════╤════════╤═══════╤═══════╤═════════════════╕
+│ Index   │ Metric     │   Mean │   Min │   Max │ Unit            │
+╞═════════╪════════════╪════════╪═══════╪═══════╪═════════════════╡
+│ 14.3.0  │ Read Req   │   0.07 │  0.00 │  1.71 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.1  │ Write Req  │   0.04 │  0.00 │  2.33 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.2  │ Atomic Req │   0.00 │  0.00 │  0.00 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.3  │ Stall      │   0.00 │  0.00 │  0.01 │ Cycles per wave │
+╘═════════╧════════════╧════════╧═══════╧═══════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+15. Texture Addresser and Texture Data (TA/TD)
+15.1 TA
+╒═════════╤════════════════════════╤═════════╤═══════╤═════════╤═════════════════╕
+│ Index   │ Metric                 │     Avg │   Min │     Max │ Unit            │
+╞═════════╪════════════════════════╪═════════╪═══════╪═════════╪═════════════════╡
+│ 15.1.0  │ TA Busy                │   66.82 │ 29.97 │   68.87 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.1  │ TC2TA Addr Stall       │   48.13 │  0.00 │   51.34 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.2  │ TC2TA Data Stall       │    7.34 │  6.22 │   20.67 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.3  │ TD2TA Addr Stall       │    0.00 │  0.00 │    0.00 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.4  │ Total Instructions     │  426.75 │  1.00 │  585.19 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.5  │ Flat Instr             │    0.05 │  0.00 │    1.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.6  │ Flat Read Instr        │    0.00 │  0.00 │    0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.7  │ Flat Write Instr       │    0.05 │  0.00 │    1.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.8  │ Flat Atomic Instr      │    0.00 │  0.00 │    0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.9  │ Buffer Instr           │  425.41 │  0.00 │  459.19 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.10 │ Buffer Read Instr      │  364.79 │  0.00 │  445.07 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.11 │ Buffer Write Instr     │    0.99 │  0.00 │   58.40 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.12 │ Buffer Atomic Instr    │   60.63 │  0.00 │   64.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.13 │ Buffer Total Cylces    │ 6790.03 │  0.00 │ 7168.00 │ Cycles per wave │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.14 │ Buffer Coalesced Read  │    0.00 │  0.00 │    0.00 │ Cycles per wave │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.15 │ Buffer Coalesced Write │    0.00 │  0.00 │    0.00 │ Cycles per wave │
+╘═════════╧════════════════════════╧═════════╧═══════╧═════════╧═════════════════╛
+15.2 TD
+╒═════════╤═══════════════════╤════════╤═══════╤════════╤════════════════╕
+│ Index   │ Metric            │ Avg    │ Min   │ Max    │ Unit           │
+╞═════════╪═══════════════════╪════════╪═══════╪════════╪════════════════╡
+│ 15.2.0  │ TD Busy           │ 73.41  │ 48.73 │ 74.45  │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.1  │ TC2TD Stall       │ 63.49  │ 47.79 │ 65.53  │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.2  │ SPI2TD Stall      │        │       │        │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.3  │ Coalescable Instr │ 1.98   │ 0.0   │ 116.8  │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.4  │ Load Instr        │ 365.78 │ 0.0   │ 503.47 │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.5  │ Store Instr       │ 0.05   │ 0.0   │ 1.0    │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼────────┼────────────────┤
+│ 15.2.6  │ Atomic Instr      │ 60.63  │ 0.0   │ 64.0   │ Instr per wave │
+╘═════════╧═══════════════════╧════════╧═══════╧════════╧════════════════╛
+
+
+--------------------------------------------------------------------------------
+16. Vector L1 Data Cache
+16.1 Speed-of-Light
+╒═════════╤═══════════════════╤═════════╤═════════════╕
+│ Index   │ Metric            │   Value │ Unit        │
+╞═════════╪═══════════════════╪═════════╪═════════════╡
+│ 16.1.0  │ Buffer Coalescing │   25.13 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.1  │ Cache Util        │   76.45 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.2  │ Cache BW          │   18.58 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.3  │ Cache Hit         │   37.13 │ Pct of peak │
+╘═════════╧═══════════════════╧═════════╧═════════════╛
+16.2 L1D Cache Stalls
+╒═════════╤════════════════════════╤════════╤═══════╤═══════╤════════╕
+│ Index   │ Metric                 │   Mean │   Min │   Max │ Unit   │
+╞═════════╪════════════════════════╪════════╪═══════╪═══════╪════════╡
+│ 16.2.0  │ Stalled on L2 Data     │  54.57 │ 50.07 │ 72.08 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.1  │ Stalled on L2 Req      │   8.99 │  7.98 │ 29.49 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.2  │ Tag RAM Stall (Read)   │   8.89 │  0.00 │  9.42 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.3  │ Tag RAM Stall (Write)  │   0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.4  │ Tag RAM Stall (Atomic) │   0.00 │  0.00 │  0.00 │ Pct    │
+╘═════════╧════════════════════════╧════════╧═══════╧═══════╧════════╛
+16.3 L1D Cache Accesses
+╒═════════╤═════════════════════╤═══════════╤═══════════╤═══════════╤════════════════╕
+│ Index   │ Metric              │       Avg │       Min │       Max │ Unit           │
+╞═════════╪═════════════════════╪═══════════╪═══════════╪═══════════╪════════════════╡
+│ 16.3.0  │ Total Req           │  27163.36 │     64.00 │  28672.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.1  │ Read Req            │  23280.09 │      0.00 │  24576.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.2  │ Write Req           │      3.25 │      0.00 │     64.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.3  │ Atomic Req          │   3880.02 │      0.00 │   4096.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.4  │ Cache BW            │   2142.88 │   1478.26 │   2299.51 │ Gb/s           │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.5  │ Cache Accesses      │  12610.86 │     16.00 │  13312.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.6  │ Cache Hits          │   4595.14 │      8.00 │   4860.12 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.7  │ Cache Hit Rate      │     37.13 │     36.34 │     50.00 │ Pct            │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.8  │ Invalidate          │      0.09 │      0.00 │      0.09 │ per wave       │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.9  │ L1-L2 BW            │ 513006.17 │ 513006.17 │ 513006.17 │ Bytes per wave │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.10 │ L1-L2 Read          │   7045.31 │      0.00 │   7450.25 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.11 │ L1-L2 Write         │      0.41 │      0.00 │      8.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.12 │ L1-L2 Atomic        │    970.00 │      0.00 │   1024.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.13 │ L1 Access Latency   │    434.68 │    332.95 │    772.70 │ Cycles         │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.14 │ L1-L2 Read Latency  │    176.77 │    175.94 │    177.80 │ Cycles         │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.15 │ L1-L2 Write Latency │    516.16 │    360.58 │    529.82 │ Cycles         │
+╘═════════╧═════════════════════╧═══════════╧═══════════╧═══════════╧════════════════╛
+16.4 L1D - L2 Transactions
+╒═════════╤═════════════╤════════╤═════════════╤═════════╤═══════╤═════════╤══════════════╕
+│ Index   │ Metric      │ Xfer   │ Coherency   │     Avg │   Min │     Max │ Unit         │
+╞═════════╪═════════════╪════════╪═════════════╪═════════╪═══════╪═════════╪══════════════╡
+│ 16.4.0  │ NC - Read   │ Read   │ NC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.1  │ UC - Read   │ Read   │ UC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.2  │ CC - Read   │ Read   │ CC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.3  │ RW - Read   │ Read   │ RW          │ 7043.07 │  0.00 │ 7444.33 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.4  │ RW - Write  │ Write  │ RW          │    0.41 │  0.00 │    8.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.5  │ NC - Write  │ Write  │ NC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.6  │ UC - Write  │ Write  │ UC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.7  │ CC - Write  │ Write  │ CC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.8  │ NC - Atomic │ Atomic │ NC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.9  │ UC - Atomic │ Atomic │ UC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.10 │ CC - Atomic │ Atomic │ CC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.11 │ RW - Atomic │ Atomic │ RW          │  970.00 │  0.00 │ 1024.00 │ Req per wave │
+╘═════════╧═════════════╧════════╧═════════════╧═════════╧═══════╧═════════╧══════════════╛
+16.5 L1D Addr Translation
+╒═════════╤══════════════════════╤══════════╤═══════╤══════════╤═════════════════╕
+│ Index   │ Metric               │     Mean │   Min │      Max │ Units           │
+╞═════════╪══════════════════════╪══════════╪═══════╪══════════╪═════════════════╡
+│ 16.5.0  │ Req                  │ 12620.79 │ 16.00 │ 13897.50 │ Req per wave    │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.1  │ Hit Ratio            │    99.86 │ 98.02 │   100.00 │ Pct             │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.2  │ Hits                 │ 12610.44 │ 15.68 │ 13882.20 │ Hits per wave   │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.3  │ Misses (Translation) │     9.39 │  0.00 │    12.92 │ Misses per wave │
+├─────────┼──────────────────────┼──────────┼───────┼──────────┼─────────────────┤
+│ 16.5.4  │ Misses (Permission)  │     0.00 │  0.00 │     0.00 │ Misses per wave │
+╘═════════╧══════════════════════╧══════════╧═══════╧══════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+17. L2 Cache
+17.1 Speed-of-Light
+╒═════════╤═════════════╤═════════╤════════╕
+│ Index   │ Metric      │   Value │ Unit   │
+╞═════════╪═════════════╪═════════╪════════╡
+│ 17.1.0  │ L2 Util     │   95.88 │ Pct    │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.1  │ Cache Hit   │   84.47 │ Pct    │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.2  │ L2-EA Rd BW │  143.54 │ Gb/s   │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.3  │ L2-EA Wr BW │   83.33 │ Gb/s   │
+╘═════════╧═════════════╧═════════╧════════╛
+17.2 L2 - Fabric Transactions
+╒═════════╤══════════════════════╤══════════╤════════╤══════════╤════════════════╕
+│ Index   │ Metric               │ Avg      │ Min    │ Max      │ Unit           │
+╞═════════╪══════════════════════╪══════════╪════════╪══════════╪════════════════╡
+│ 17.2.0  │ Read BW              │ 56484.29 │ 0.0    │ 59861.15 │ Bytes per wave │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.1  │ Write BW             │ 15795.9  │ 435.22 │ 34440.27 │ Bytes per wave │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.2  │ Read (32B)           │ 0.0      │ 0.0    │ 0.0      │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.3  │ Read (Uncached 32B)  │ 0.5      │ 0.0    │ 0.83     │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.4  │ Read (64B)           │ 882.57   │ 0.0    │ 935.33   │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.5  │ HBM Read             │ 881.34   │ 0.0    │ 934.78   │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.6  │ Write (32B)          │ 0.0      │ 0.0    │ 0.0      │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.7  │ Write (Uncached 32B) │ 0.0      │ 0.0    │ 0.0      │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.8  │ Write (64B)          │ 246.81   │ 6.8    │ 538.13   │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.9  │ HBM Write            │ 242.03   │ 6.8    │ 256.0    │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.10 │ Read Latency         │ 292.11   │ 257.92 │ 1200.67  │ Cycles         │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.11 │ Write Latency        │ 232.92   │ 111.1  │ 294.42   │ Cycles         │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.12 │ Atomic Latency       │          │        │          │ Cycles         │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.13 │ Read Stall           │ 0.0      │ 0.0    │ 0.0      │ Pct            │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.14 │ Write Stall          │ 0.31     │ 0.0    │ 6.82     │ Pct            │
+╘═════════╧══════════════════════╧══════════╧════════╧══════════╧════════════════╛
+17.3 L2 Cache Accesses
+╒═════════╤════════════════════╤═════════╤═══════╤═════════╤═════════════════╕
+│ Index   │ Metric             │     Avg │   Min │     Max │ Unit            │
+╞═════════╪════════════════════╪═════════╪═══════╪═════════╪═════════════════╡
+│ 17.3.0  │ Req                │ 8017.25 │  8.00 │ 8475.15 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.1  │ Streaming Req      │    0.00 │  0.00 │    0.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.2  │ Read Req           │ 7044.57 │  0.00 │ 7445.85 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.3  │ Write Req          │    0.41 │  0.00 │    8.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.4  │ Atomic Req         │  970.00 │  0.00 │ 1024.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.5  │ Probe Req          │    0.00 │  0.00 │    0.03 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.6  │ Hits               │ 7134.45 │  0.00 │ 7544.96 │ Hits per wave   │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.7  │ Misses             │  882.80 │  8.00 │  934.50 │ Misses per wave │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.8  │ Cache Hit          │   84.47 │  0.03 │   89.17 │ Pct             │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.9  │ Writeback          │  242.02 │  6.80 │  256.00 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.10 │ NC Req             │    0.06 │  0.00 │    0.78 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.11 │ UC Req             │    0.26 │  0.00 │    0.56 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.12 │ CC Req             │    0.00 │  0.00 │    0.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.13 │ RW Req             │ 8022.63 │  8.00 │ 8980.70 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.14 │ Writeback (Normal) │  241.82 │  6.80 │  256.00 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.15 │ Writeback (TC Req) │    0.20 │  0.00 │    1.20 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.16 │ Evict (Normal)     │  833.06 │  5.87 │  882.74 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.17 │ Evict (TC Req)     │    0.00 │  0.00 │    0.00 │ per wave        │
+╘═════════╧════════════════════╧═════════╧═══════╧═════════╧═════════════════╛
+17.4 L2 - EA Interface Stalls
+╒═════════╤═════════════════════════════╤═════════════════════╤═══════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric                      │ Type                │ Transaction   │   Avg │   Min │   Max │ Unit         │
+╞═════════╪═════════════════════════════╪═════════════════════╪═══════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 17.4.0  │ Read - Remote Socket Stall  │ Remote Socket Stall │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.1  │ Read - Peer GCD Stall       │ Peer GCD Stall      │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.2  │ Read - HBM Stall            │ HBM Stall           │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.3  │ Write - Remote Socket Stall │ Remote Socket Stall │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.4  │ Write - Peer GCD Stall      │ Peer GCD Stall      │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.5  │ Write - HBM Stall           │ HBM Stall           │ Write         │  0.08 │  0.00 │  1.76 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.6  │ Write - Credit Starvation   │ Credit Starvation   │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+╘═════════╧═════════════════════════════╧═════════════════════╧═══════════════╧═══════╧═══════╧═══════╧══════════════╛
+
+
+--------------------------------------------------------------------------------
+18. L2 Cache (per Channel)
+18.1 Aggregate Stats (All 32 channels)
+╒═════════╤════════════════════════════╤════════╤═══════════╤════════╤═════════╤═════════════════╕
+│ Index   │ Metric                     │ Mean   │ Std Dev   │ Min    │ Max     │ Units           │
+╞═════════╪════════════════════════════╪════════╪═══════════╪════════╪═════════╪═════════════════╡
+│ 18.1.0  │ L2 Cache Hit Rate          │ 84.53  │ 19.71     │ 0.03   │ 89.18   │ Pct             │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.1  │ Req                        │ 250.33 │ 58.5      │ 0.25   │ 264.7   │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.2  │ L1 - L2 Read Req           │ 220.01 │ 51.47     │ 0.0    │ 232.7   │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.3  │ L1 - L2 Write Req          │ 0.01   │ 0.06      │ 0.0    │ 0.25    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.4  │ L1 - L2 Atomic Req         │ 30.31  │ 7.09      │ 0.0    │ 32.0    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.5  │ L2 - EA Read Req           │ 27.47  │ 6.43      │ 0.0    │ 29.25   │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.6  │ L2 - EA Write Req          │ 7.56   │ 1.72      │ 0.21   │ 8.02    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.7  │ L2 - EA Atomic Req         │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.8  │ L2 - EA Read Lat           │ 287.51 │ 155.99    │ 249.3  │ 1256.67 │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.9  │ L2 - EA Write Lat          │ 234.24 │ 11.79     │ 228.36 │ 292.43  │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.10 │ L2 - EA Atomic Lat         │        │           │        │         │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.11 │ L2 - EA Read Stall (IO)    │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.12 │ L2 - EA Read Stall (GMI)   │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.13 │ L2 - EA Read Stall (DRAM)  │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.14 │ L2 - EA Write Stall (IO)   │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.15 │ L2 - EA Write Stall (GMI)  │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.16 │ L2 - EA Write Stall (DRAM) │ 0.01   │ 0.03      │ 0.0    │ 0.22    │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.17 │ L2 - EA Write Starve       │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+╘═════════╧════════════════════════════╧════════╧═══════════╧════════╧═════════╧═════════════════╛
+18.2 Channel 0-15
+╒═════════════════════════════════════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕
+│                                         │ 18.2.0   │ 18.2.1   │ 18.2.2   │ 18.2.3   │ 18.2.4   │ 18.2.5   │ 18.2.6   │ 18.2.7   │ 18.2.8   │ 18.2.9   │ 18.2.10   │ 18.2.11   │ 18.2.12   │ 18.2.13   │ 18.2.14   │ 18.2.15   │
+╞═════════════════════════════════════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
+│ Channel                                 │ 0.0      │ 1.0      │ 2.0      │ 3.0      │ 4.0      │ 5.0      │ 6.0      │ 7.0      │ 8.0      │ 9.0      │ 10.0      │ 11.0      │ 12.0      │ 13.0      │ 14.0      │ 15.0      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2 Cache Hit Rate (%)                   │ 84.54    │ 84.53    │ 84.52    │ 84.52    │ 84.52    │ 84.53    │ 84.5     │ 84.53    │ 84.52    │ 84.52    │ 84.53     │ 84.53     │ 84.53     │ 84.52     │ 84.53     │ 84.52     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ Requests (Requests)                     │ 250.44   │ 250.3    │ 250.35   │ 250.3    │ 250.34   │ 250.3    │ 250.42   │ 250.31   │ 250.36   │ 250.31   │ 250.35    │ 250.31    │ 250.41    │ 250.3     │ 250.35    │ 250.31    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Read (Requests)                   │ 220.12   │ 219.98   │ 220.02   │ 219.98   │ 220.02   │ 219.98   │ 220.09   │ 219.98   │ 220.04   │ 219.98   │ 220.02    │ 219.98    │ 220.08    │ 219.98    │ 220.02    │ 219.98    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Write (Requests)                  │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Atomic (Requests)                 │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31     │ 30.31     │ 30.31     │ 30.31     │ 30.31     │ 30.31     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read (Requests)                   │ 27.45    │ 27.45    │ 27.48    │ 27.47    │ 27.48    │ 27.45    │ 27.58    │ 27.44    │ 27.47    │ 27.46    │ 27.46     │ 27.44     │ 27.47     │ 27.46     │ 27.47     │ 27.46     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write (Requests)                  │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56      │ 7.56      │ 7.56      │ 7.56      │ 7.56      │ 7.56      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Latency (Cycles)             │ 256.21   │ 252.14   │ 253.55   │ 252.61   │ 244.75   │ 253.17   │ 259.11   │ 263.43   │ 247.69   │ 252.08   │ 250.41    │ 241.99    │ 253.06    │ 253.48    │ 257.94    │ 264.08    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Latency (Cycles)            │ 231.67   │ 232.18   │ 233.26   │ 235.77   │ 232.45   │ 233.34   │ 237.53   │ 244.48   │ 231.65   │ 234.27   │ 231.17    │ 233.31    │ 234.1     │ 232.4     │ 238.31    │ 243.01    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic Latency (Cycles)           │          │          │          │          │          │          │          │          │          │          │           │           │           │           │           │           │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - IO (Cycles per)      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - GMI (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - DRAM (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - IO (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - GMI (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - DRAM (Cycles per)   │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.0      │ 0.0      │ 0.01     │ 0.01     │ 0.01     │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.0       │ 0.01      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - Starve (Cycles per) │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+╘═════════════════════════════════════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛
+18.3 Channel 16-31
+╒═════════════════════════════════════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕
+│                                         │ 18.3.0   │ 18.3.1   │ 18.3.2   │ 18.3.3   │ 18.3.4   │ 18.3.5   │ 18.3.6   │ 18.3.7   │ 18.3.8   │ 18.3.9   │ 18.3.10   │ 18.3.11   │ 18.3.12   │ 18.3.13   │ 18.3.14   │ 18.3.15   │
+╞═════════════════════════════════════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
+│ Channel                                 │ 16.0     │ 17.0     │ 18.0     │ 19.0     │ 20.0     │ 21.0     │ 22.0     │ 23.0     │ 24.0     │ 25.0     │ 26.0      │ 27.0      │ 28.0      │ 29.0      │ 30.0      │ 31.0      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2 Cache Hit Rate (%)                   │ 84.53    │ 84.52    │ 84.53    │ 84.53    │ 84.52    │ 84.53    │ 84.56    │ 84.53    │ 84.53    │ 84.53    │ 84.55     │ 84.52     │ 84.52     │ 84.52     │ 84.52     │ 84.52     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ Requests (Requests)                     │ 250.35   │ 250.3    │ 250.35   │ 250.31   │ 250.35   │ 250.3    │ 250.35   │ 250.31   │ 250.34   │ 250.3    │ 250.35    │ 250.31    │ 250.35    │ 250.3     │ 250.35    │ 250.31    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Read (Requests)                   │ 220.02   │ 219.98   │ 220.02   │ 219.98   │ 220.02   │ 219.98   │ 220.02   │ 219.98   │ 220.02   │ 219.98   │ 220.02    │ 219.98    │ 220.02    │ 219.98    │ 220.02    │ 219.98    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Write (Requests)                  │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Atomic (Requests)                 │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31    │ 30.31     │ 30.31     │ 30.31     │ 30.31     │ 30.31     │ 30.31     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read (Requests)                   │ 27.48    │ 27.45    │ 27.47    │ 27.49    │ 27.47    │ 27.44    │ 27.47    │ 27.43    │ 27.46    │ 27.45    │ 27.45     │ 27.47     │ 27.49     │ 27.46     │ 27.48     │ 27.54     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write (Requests)                  │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56     │ 7.56      │ 7.56      │ 7.56      │ 7.56      │ 7.56      │ 7.56      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Latency (Cycles)             │ 249.15   │ 249.93   │ 250.38   │ 241.62   │ 716.44   │ 253.5    │ 258.2    │ 254.54   │ 240.44   │ 250.87   │ 250.23    │ 241.75    │ 253.09    │ 253.68    │ 257.73    │ 255.05    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Latency (Cycles)            │ 228.59   │ 230.66   │ 228.37   │ 231.29   │ 232.72   │ 234.18   │ 240.01   │ 244.98   │ 230.87   │ 227.85   │ 229.77    │ 229.21    │ 233.04    │ 232.43    │ 238.18    │ 244.8     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic Latency (Cycles)           │          │          │          │          │          │          │          │          │          │          │           │           │           │           │           │           │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - IO (Cycles per)      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - GMI (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - DRAM (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - IO (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - GMI (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - DRAM (Cycles per)   │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.0      │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.0      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - Starve (Cycles per) │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+╘═════════════════════════════════════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛
+
diff --git a/results/analyse_k4.txt b/results/analyse_k4.txt
new file mode 100644
index 00000000000..cb1e1c253cf
--- /dev/null
+++ b/results/analyse_k4.txt
@@ -0,0 +1,967 @@
+
+--------
+Analyze
+--------
+
+
+--------------------------------------------------------------------------------
+0. Top Stat
+╒════╤══════════════════════════════════════════╤═════════╤═════════════╤════════════╤══════════════╤═══════╕
+│    │ KernelName                               │   Count │     Sum(ns) │   Mean(ns) │   Median(ns) │   Pct │
+╞════╪══════════════════════════════════════════╪═════════╪═════════════╪════════════╪══════════════╪═══════╡
+│  0 │ void kernel_gemm_xdlops_v2r4r2_simplifie │   56.00 │ 91357961.00 │ 1631392.16 │   1637282.50 │ 99.88 │
+│    │ d<GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v │         │             │            │              │       │
+│    │ 2r4r2, true, (InMemoryDataOperationEn... │         │             │            │              │       │
+├────┼──────────────────────────────────────────┼─────────┼─────────────┼────────────┼──────────────┼───────┤
+│  1 │ __amd_rocclr_fillBufferAligned.kd        │    3.00 │   106720.00 │   35573.33 │     37280.00 │  0.12 │
+╘════╧══════════════════════════════════════════╧═════════╧═════════════╧════════════╧══════════════╧═══════╛
+
+
+--------------------------------------------------------------------------------
+1. System Info
+╒══════════════════╤══════════════════════════════════════════════════════════════════════╕
+│                  │ Info                                                                 │
+╞══════════════════╪══════════════════════════════════════════════════════════════════════╡
+│ workload_name    │ bin                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ command          │ ./example_splitK_gemm_xdl_fp16 1 2 1 4 3840 4096 4096 4096 4096 4096 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_name        │ t008-004.hpcfund                                                     │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_cpu         │ AMD EPYC 7V13 64-Core Processor                                      │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_distro      │ Rocky Linux 9.1 (Blue Onyx)                                          │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_kernel      │ 5.14.0-162.18.1.el9_1.x86_64                                         │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ host_rocmver     │ 5.7.1-98                                                             │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ date             │ Fri May 24 01:12:00 2024 (CDT)                                       │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ gpu_soc          │ gfx908                                                               │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSE            │ 8                                                                    │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numCU            │ 120                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSIMD          │ 4                                                                    │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ waveSize         │ 64                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ maxWavesPerCU    │ 40                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ maxWorkgroupSize │ 1024                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L1               │ 16                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L2               │ 8192                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ sclk             │ 1502                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ mclk             │ 1200                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ cur_sclk         │ 300                                                                  │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ cur_mclk         │ 1200                                                                 │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ L2Banks          │ 32                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ LDSBanks         │ 32                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ name             │ mi100                                                                │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ numSQC           │ 48                                                                   │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ hbmBW            │ 1228.8                                                               │
+├──────────────────┼──────────────────────────────────────────────────────────────────────┤
+│ ip_blocks        │ SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF                                 │
+╘══════════════════╧══════════════════════════════════════════════════════════════════════╛
+
+
+--------------------------------------------------------------------------------
+2. System Speed-of-Light
+2.1 Speed-of-Light
+╒═════════╤═══════════════════════════╤═════════╤══════════════════╤═══════════╤═══════╕
+│ Index   │ Metric                    │ Value   │ Unit             │ Peak      │ PoP   │
+╞═════════╪═══════════════════════════╪═════════╪══════════════════╪═══════════╪═══════╡
+│ 2.1.0   │ VALU FLOPs                │         │ Gflops           │ 23070.72  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.1   │ VALU IOPs                 │         │ Gops             │ 23070.72  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.2   │ MFMA FLOPs (BF16)         │         │ Gflops           │ 92282.88  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.3   │ MFMA FLOPs (F16)          │         │ Gflops           │ 184565.76 │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.4   │ MFMA FLOPs (F32)          │         │ Gflops           │ 46141.44  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.5   │ MFMA FLOPs (F64)          │         │ Gflops           │ 46141.44  │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.6   │ MFMA IOPs (Int8)          │         │ Gops             │ 184565.76 │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.7   │ Active CUs                │ 101.0   │ Cus              │ 120.0     │ 84.17 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.8   │ SALU Util                 │ 0.91    │ Pct              │ 100.0     │ 0.91  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.9   │ VALU Util                 │ 10.52   │ Pct              │ 100.0     │ 10.52 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.10  │ MFMA Util                 │         │ Pct              │ 100.0     │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.11  │ VALU Active Threads/Wave  │ 63.98   │ Threads          │ 64.0      │ 99.97 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.12  │ IPC - Issue               │ 0.54    │ Instr/cycle      │ 5.0       │ 10.89 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.13  │ LDS BW                    │ 2855.64 │ Gb/sec           │ 23070.72  │ 12.38 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.14  │ LDS Bank Conflict         │ 0.1     │ Conflicts/access │ 32.0      │ 0.32  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.15  │ Instr Cache Hit Rate      │ 100.0   │ Pct              │ 100.0     │ 100.0 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.16  │ Instr Cache BW            │ 317.33  │ Gb/s             │ 4614.14   │ 6.88  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.17  │ Scalar L1D Cache Hit Rate │ 99.42   │ Pct              │ 100.0     │ 99.42 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.18  │ Scalar L1D Cache BW       │ 42.03   │ Gb/s             │ 4614.14   │ 0.91  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.19  │ Vector L1D Cache Hit Rate │ 34.32   │ Pct              │ 100.0     │ 34.32 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.20  │ Vector L1D Cache BW       │ 2142.97 │ Gb/s             │ 11535.36  │ 18.58 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.21  │ L2 Cache Hit Rate         │ 83.14   │ Pct              │ 100.0     │ 83.14 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.22  │ L2-Fabric Read BW         │ 169.08  │ Gb/s             │ 1228.8    │ 13.76 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.23  │ L2-Fabric Write BW        │ 116.59  │ Gb/s             │ 1228.8    │ 9.49  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.24  │ L2-Fabric Read Latency    │ 304.86  │ Cycles           │           │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.25  │ L2-Fabric Write Latency   │ 239.12  │ Cycles           │           │       │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.26  │ Wave Occupancy            │ 947.95  │ Wavefronts       │ 4800.0    │ 19.75 │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.27  │ Instr Fetch BW            │ 158.67  │ Gb/s             │ 2307.07   │ 6.88  │
+├─────────┼───────────────────────────┼─────────┼──────────────────┼───────────┼───────┤
+│ 2.1.28  │ Instr Fetch Latency       │ 16.26   │ Cycles           │           │       │
+╘═════════╧═══════════════════════════╧═════════╧══════════════════╧═══════════╧═══════╛
+
+
+--------------------------------------------------------------------------------
+5. Command Processor (CPC/CPF)
+5.1 Command Processor Fetcher
+╒═════════╤════════════════════╤════════════╤══════════╤════════════╤═══════════════╕
+│ Index   │ Metric             │        Avg │      Min │        Max │ Unit          │
+╞═════════╪════════════════════╪════════════╪══════════╪════════════╪═══════════════╡
+│ 5.1.0   │ GPU Busy Cycles    │ 2160445.46 │ 61005.00 │ 2287659.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.1   │ CPF Busy           │ 2160445.46 │ 61005.00 │ 2287659.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.2   │ CPF Util           │     100.00 │   100.00 │     100.00 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.3   │ CPF Stall          │       0.05 │     0.00 │       0.13 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.4   │ L2Cache Intf Busy  │   13042.93 │  1818.00 │   18146.00 │ Cycles/kernel │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.5   │ L2Cache Intf Util  │       0.76 │     0.15 │       6.29 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.6   │ L2Cache Intf Stall │       0.00 │     0.00 │       0.00 │ Pct           │
+├─────────┼────────────────────┼────────────┼──────────┼────────────┼───────────────┤
+│ 5.1.7   │ UTCL1 Stall        │    1139.22 │     0.00 │    2077.00 │ Cycles/kernel │
+╘═════════╧════════════════════╧════════════╧══════════╧════════════╧═══════════════╛
+5.2 Command Processor Compute
+╒═════════╤════════════════════════╤════════════╤══════════╤════════════╤════════╕
+│ Index   │ Metric                 │        Avg │      Min │        Max │ Unit   │
+╞═════════╪════════════════════════╪════════════╪══════════╪════════════╪════════╡
+│ 5.2.0   │ GPU Busy Cycles        │ 2160445.46 │ 61005.00 │ 2287659.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.1   │ CPC Busy Cycles        │ 2160445.46 │ 61005.00 │ 2287659.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.2   │ CPC Util               │     100.00 │   100.00 │     100.00 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.3   │ CPC Stall Cycles       │   57671.10 │  7182.00 │   69294.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.4   │ CPC Stall Rate         │       3.17 │     1.65 │      15.01 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.5   │ CPC Packet Decoding    │  263149.39 │  9969.00 │  281071.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.6   │ SPI Intf Busy Cycles   │ 1890376.66 │ 40082.00 │ 2006236.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.7   │ SPI Intf Util          │      86.00 │    52.13 │      87.99 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.8   │ L2Cache Intf Util      │       0.08 │     0.02 │       1.62 │ Pct    │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.9   │ UTCL1 Stall Cycles     │    5113.08 │   588.00 │    6374.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.10  │ UTCL2 Intf Busy Cycles │    5699.53 │   913.00 │    6549.00 │ Cycles │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼────────┤
+│ 5.2.11  │ UTCL2 Intf Util        │       0.45 │     0.18 │       5.50 │ Pct    │
+╘═════════╧════════════════════════╧════════════╧══════════╧════════════╧════════╛
+
+
+--------------------------------------------------------------------------------
+6. Shader Processor Input (SPI)
+6.1 SPI Stats
+╒═════════╤════════════════════════╤═════════════╤═══════════╤═════════════╤════════════╕
+│ Index   │ Metric                 │         Avg │       Min │         Max │ Unit       │
+╞═════════╪════════════════════════╪═════════════╪═══════════╪═════════════╪════════════╡
+│ 6.1.0   │ GPU Busy               │  2160445.46 │  61005.00 │  2287659.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.1   │ CS Busy                │ 17157252.37 │ 391853.00 │ 18171857.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.2   │ SPI Busy               │  2143104.66 │  48203.00 │  2264410.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.3   │ SQ Busy                │ 17147516.41 │ 378659.00 │ 18162341.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.4   │ Dispatched Workgroups  │     2603.39 │   1920.00 │    15360.00 │ Workgroups │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.5   │ Dispatched Wavefronts  │    10413.56 │   7680.00 │    61440.00 │ Wavefronts │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.6   │ Wave Alloc Failed      │  3763964.39 │  15514.00 │  3983350.00 │ Cycles     │
+├─────────┼────────────────────────┼─────────────┼───────────┼─────────────┼────────────┤
+│ 6.1.7   │ Wave Alloc Failed - CS │   207098.81 │  76954.00 │   240649.00 │ Cycles     │
+╘═════════╧════════════════════════╧═════════════╧═══════════╧═════════════╧════════════╛
+6.2 SPI Resource Allocation
+╒═════════╤═════════════════════════════╤══════════════╤══════════╤══════════════╤═════════════╕
+│ Index   │ Metric                      │          Avg │      Min │          Max │ Unit        │
+╞═════════╪═════════════════════════════╪══════════════╪══════════╪══════════════╪═════════════╡
+│ 6.2.0   │ Wave request Failed (CS)    │    207098.81 │ 76954.00 │    240649.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.1   │ CS Stall                    │   3569229.44 │     0.00 │   3788924.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.2   │ CS Stall Rate               │       158.26 │     0.00 │       167.92 │ Pct         │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.3   │ Scratch Stall               │         0.00 │     0.00 │         0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.4   │ Insufficient SIMD Waveslots │         0.00 │     0.00 │         0.00 │ Simd        │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.5   │ Insufficient SIMD VGPRs     │ 213547644.14 │     0.00 │ 226827066.00 │ Simd        │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.6   │ Insufficient SIMD SGPRs     │         0.00 │     0.00 │         0.00 │ Simd        │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.7   │ Insufficient CU LDS         │  53559224.75 │     0.00 │  56800350.00 │ Cu          │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.8   │ Insufficient CU Barries     │         0.00 │     0.00 │         0.00 │ Cu          │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.9   │ Insufficient Bulky Resource │         0.00 │     0.00 │         0.00 │ Cu          │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.10  │ Reach CU Threadgroups Limit │         0.00 │     0.00 │         0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.11  │ Reach CU Wave Limit         │         0.00 │     0.00 │         0.00 │ Cycles      │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.12  │ VGPR Writes                 │         4.00 │     4.00 │         4.00 │ Cycles/wave │
+├─────────┼─────────────────────────────┼──────────────┼──────────┼──────────────┼─────────────┤
+│ 6.2.13  │ SGPR Writes                 │         5.95 │     5.00 │         6.00 │ Cycles/wave │
+╘═════════╧═════════════════════════════╧══════════════╧══════════╧══════════════╧═════════════╛
+
+
+--------------------------------------------------------------------------------
+7. Wavefront
+7.1 Wavefront Launch Stats
+╒═════════╤═════════════════════╤═══════════╤═══════════╤════════════╤════════════╕
+│ Index   │ Metric              │       Avg │       Min │        Max │ Unit       │
+╞═════════╪═════════════════════╪═══════════╪═══════════╪════════════╪════════════╡
+│ 7.1.0   │ Grid Size           │ 666467.80 │ 491520.00 │ 3932160.00 │ Work items │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.1   │ Workgroup Size      │    256.00 │    256.00 │     256.00 │ Work items │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.2   │ Total Wavefronts    │  10413.56 │   7680.00 │   61440.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.3   │ Saved Wavefronts    │      0.00 │      0.00 │       0.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.4   │ Restored Wavefronts │      8.14 │      0.00 │     480.00 │ Wavefronts │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.5   │ VGPRs               │    121.90 │      8.00 │     128.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.6   │ AGPRs               │    121.90 │      8.00 │     128.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.7   │ SGPRs               │     32.00 │     32.00 │      32.00 │ Registers  │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.8   │ LDS Allocation      │  23812.34 │      0.00 │   25088.00 │ Bytes      │
+├─────────┼─────────────────────┼───────────┼───────────┼────────────┼────────────┤
+│ 7.1.9   │ Scratch Allocation  │      0.00 │      0.00 │       0.00 │ Bytes      │
+╘═════════╧═════════════════════╧═══════════╧═══════════╧════════════╧════════════╛
+7.2 Wavefront Runtime Stats
+╒═════════╤════════════════════════╤════════════╤══════════╤════════════╤═════════════════╕
+│ Index   │ Metric                 │        Avg │      Min │        Max │ Unit            │
+╞═════════╪════════════════════════╪════════════╪══════════╪════════════╪═════════════════╡
+│ 7.2.0   │ Kernel Time (Nanosec)  │ 1550248.83 │ 27520.00 │ 1661443.00 │ Ns              │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.1   │ Kernel Time (Cycles)   │ 2160445.46 │ 61005.00 │ 2287659.00 │ Cycle           │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.2   │ Instr/wavefront        │    4068.51 │    46.00 │    4284.00 │ Instr/wavefront │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.3   │ Wave Cycles            │  257910.79 │  1773.61 │  273479.97 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.4   │ Dependency Wait Cycles │  135923.73 │  1103.83 │  143792.40 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.5   │ Issue Wait Cycles      │   99579.56 │    47.36 │  106512.35 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.6   │ Active Cycles          │   21874.94 │   188.01 │   23669.75 │ Cycles per wave │
+├─────────┼────────────────────────┼────────────┼──────────┼────────────┼─────────────────┤
+│ 7.2.7   │ Wavefront Occupancy    │     947.95 │   917.28 │    1713.94 │ Wavefronts      │
+╘═════════╧════════════════════════╧════════════╧══════════╧════════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+10. Compute Units - Instruction Mix
+10.1 Instruction Mix
+╒═════════╤═══════════════╤════════╤═══════╤═══════╤════════════════╕
+│ Index   │ Metric        │ Avg    │ Min   │ Max   │ Unit           │
+╞═════════╪═══════════════╪════════╪═══════╪═══════╪════════════════╡
+│ 10.1.0  │ VALU - Vector │        │       │       │ Instr per wave │
+├─────────┼───────────────┼────────┼───────┼───────┼────────────────┤
+│ 10.1.1  │ VMEM          │        │       │       │ Instr per wave │
+├─────────┼───────────────┼────────┼───────┼───────┼────────────────┤
+│ 10.1.2  │ LDS           │ 683.39 │ 0.0   │ 720.0 │ Instr per wave │
+├─────────┼───────────────┼────────┼───────┼───────┼────────────────┤
+│ 10.1.3  │ VALU - MFMA   │        │       │       │ Instr per wave │
+├─────────┼───────────────┼────────┼───────┼───────┼────────────────┤
+│ 10.1.4  │ SALU          │ 94.53  │ 11.0  │ 99.0  │ Instr per wave │
+├─────────┼───────────────┼────────┼───────┼───────┼────────────────┤
+│ 10.1.5  │ SMEM          │ 5.1    │ 5.0   │ 7.0   │ Instr per wave │
+├─────────┼───────────────┼────────┼───────┼───────┼────────────────┤
+│ 10.1.6  │ Branch        │ 29.73  │ 6.0   │ 31.0  │ Instr per wave │
+├─────────┼───────────────┼────────┼───────┼───────┼────────────────┤
+│ 10.1.7  │ GDS           │ 0.0    │ 0.0   │ 0.0   │ Instr per wave │
+╘═════════╧═══════════════╧════════╧═══════╧═══════╧════════════════╛
+10.2 VALU Arithmetic Instr Mix
+╒═════════╤════════════╤═══════╤═══════╤═══════╤════════════════╕
+│ Index   │ Metric     │ Avg   │ Min   │ Max   │ Unit           │
+╞═════════╪════════════╪═══════╪═══════╪═══════╪════════════════╡
+│ 10.2.0  │ INT-32     │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.1  │ INT-64     │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.2  │ F16-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.3  │ F16-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.4  │ F16-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.5  │ F16-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.6  │ F32-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.7  │ F32-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.8  │ F32-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.9  │ F32-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.10 │ F64-ADD    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.11 │ F64-Mult   │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.12 │ F64-FMA    │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.13 │ F64-Trans  │       │       │       │ Instr per wave │
+├─────────┼────────────┼───────┼───────┼───────┼────────────────┤
+│ 10.2.14 │ Conversion │       │       │       │ Instr per wave │
+╘═════════╧════════════╧═══════╧═══════╧═══════╧════════════════╛
+10.3 VMEM Instr Mix
+╒═════════╤═══════════════╤═════════╕
+│ Index   │ Type          │   Count │
+╞═════════╪═══════════════╪═════════╡
+│ 10.3.0  │ Buffer Instr  │  242.98 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.1  │ Buffer Read   │  182.24 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.2  │ Buffer Write  │    0.00 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.3  │ Buffer Atomic │   60.75 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.4  │ Flat Instr    │    0.05 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.5  │ Flat Read     │    0.00 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.6  │ Flat Write    │    0.05 │
+├─────────┼───────────────┼─────────┤
+│ 10.3.7  │ Flat Atomic   │    0.00 │
+╘═════════╧═══════════════╧═════════╛
+10.4 MFMA Arithmetic Instr Mix
+╒═════════╤═══════════╤═════════╕
+│ Index   │ Type      │ Count   │
+╞═════════╪═══════════╪═════════╡
+│ 10.4.0  │ MFMA-I8   │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.1  │ MFMA-F16  │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.2  │ MFMA-BF16 │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.3  │ MFMA-F32  │         │
+├─────────┼───────────┼─────────┤
+│ 10.4.4  │ MFMA-F64  │         │
+╘═════════╧═══════════╧═════════╛
+
+
+--------------------------------------------------------------------------------
+11. Compute Units - Compute Pipeline
+11.1 Speed-of-Light
+╒═════════╤═════════════════════╤═════════╕
+│ Index   │ Metric              │ Value   │
+╞═════════╪═════════════════════╪═════════╡
+│ 11.1.0  │ valu_flops_pop      │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.1  │ mfma_flops_bf16_pop │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.2  │ mfma_flops_f16_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.3  │ mfma_flops_f32_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.4  │ mfma_flops_f64_pop  │         │
+├─────────┼─────────────────────┼─────────┤
+│ 11.1.5  │ mfma_flops_i8_pop   │         │
+╘═════════╧═════════════════════╧═════════╛
+11.2 Pipeline Stats
+╒═════════╤═════════════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric              │ Avg   │ Min   │ Max   │ Unit         │
+╞═════════╪═════════════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 11.2.0  │ IPC (Avg)           │ 0.14  │ 0.12  │ 0.51  │ Instr/cycle  │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.1  │ IPC (Issue)         │ 0.54  │ 0.52  │ 0.91  │ Instr/cycle  │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.2  │ SALU Util           │ 0.91  │ 0.29  │ 15.11 │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.3  │ VALU Util           │ 10.52 │ 10.24 │ 17.63 │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.4  │ VALU Active Threads │ 63.98 │ 63.43 │ 64.0  │ Threads      │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.5  │ MFMA Util           │       │       │       │ Pct          │
+├─────────┼─────────────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.2.6  │ MFMA Instr Cycles   │       │       │       │ Cycles/instr │
+╘═════════╧═════════════════════╧═══════╧═══════╧═══════╧══════════════╛
+11.3 Arithmetic Operations
+╒═════════╤═══════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric        │ Avg   │ Min   │ Max   │ Unit         │
+╞═════════╪═══════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 11.3.0  │ FLOPs (Total) │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.1  │ INT8 OPs      │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.2  │ F16 OPs       │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.3  │ BF16 OPs      │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.4  │ F32 OPs       │       │       │       │ Ops per wave │
+├─────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 11.3.5  │ F64 OPs       │       │       │       │ Ops per wave │
+╘═════════╧═══════════════╧═══════╧═══════╧═══════╧══════════════╛
+
+
+--------------------------------------------------------------------------------
+12. Local Data Share (LDS)
+12.1 Speed-of-Light
+╒═════════╤═════════════════════════╤═════════╤═════════════╕
+│ Index   │ Metric                  │   Value │ Unit        │
+╞═════════╪═════════════════════════╪═════════╪═════════════╡
+│ 12.1.0  │ Utilization             │   14.72 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.1  │ Access Rate             │    8.67 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.2  │ Bandwidth (Pct-of-Peak) │   12.38 │ Pct of peak │
+├─────────┼─────────────────────────┼─────────┼─────────────┤
+│ 12.1.3  │ Bank Conflict Rate      │    0.32 │ Pct of peak │
+╘═════════╧═════════════════════════╧═════════╧═════════════╛
+12.2 LDS Stats
+╒═════════╤══════════════════════╤═══════════╤═══════╤═══════════╤══════════════════╕
+│ Index   │ Metric               │       Avg │   Min │       Max │ Unit             │
+╞═════════╪══════════════════════╪═══════════╪═══════╪═══════════╪══════════════════╡
+│ 12.2.0  │ LDS Instrs           │    683.39 │  0.00 │    720.00 │ Instr per wave   │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.1  │ Bandwidth            │ 606492.34 │  0.00 │ 639368.00 │ Bytes per wave   │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.2  │ Bank Conficts/Access │      0.10 │  0.10 │      0.10 │ Conflicts/access │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.3  │ Index Accesses       │   5224.19 │  0.00 │   5507.06 │ Cycles per wave  │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.4  │ Atomic Cycles        │      0.00 │  0.00 │      0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.5  │ Bank Conflict        │    485.97 │  0.00 │    512.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.6  │ Addr Conflict        │      0.00 │  0.00 │      0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.7  │ Unaligned Stall      │      0.00 │  0.00 │      0.00 │ Cycles per wave  │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.8  │ Mem Violations       │      0.00 │  0.00 │      0.00 │ per wave         │
+├─────────┼──────────────────────┼───────────┼───────┼───────────┼──────────────────┤
+│ 12.2.9  │ LDS Latency          │     75.79 │ 75.75 │     75.83 │ Cycles           │
+╘═════════╧══════════════════════╧═══════════╧═══════╧═══════════╧══════════════════╛
+
+
+--------------------------------------------------------------------------------
+13. Instruction Cache
+13.1 Speed-of-Light
+╒═════════╤═══════════╤═════════╤═════════════╕
+│ Index   │ Metric    │   Value │ Unit        │
+╞═════════╪═══════════╪═════════╪═════════════╡
+│ 13.1.0  │ Bandwidth │    6.88 │ Pct of peak │
+├─────────┼───────────┼─────────┼─────────────┤
+│ 13.1.1  │ Cache Hit │   99.95 │ Pct of peak │
+╘═════════╧═══════════╧═════════╧═════════════╛
+13.2 Instruction Cache Accesses
+╒═════════╤═════════════════════════╤════════╤═══════╤════════╤═════════════════╕
+│ Index   │ L1I Metric              │   Mean │   Min │    Max │ Unit            │
+╞═════════╪═════════════════════════╪════════╪═══════╪════════╪═════════════════╡
+│ 13.2.0  │ Req                     │ 880.32 │  9.00 │ 927.00 │ Req per wave    │
+├─────────┼─────────────────────────┼────────┼───────┼────────┼─────────────────┤
+│ 13.2.1  │ Hits                    │ 880.29 │  8.89 │ 927.01 │ Hits per wave   │
+├─────────┼─────────────────────────┼────────┼───────┼────────┼─────────────────┤
+│ 13.2.2  │ Misses - Non Duplicated │   0.00 │  0.00 │   0.01 │ Misses per wave │
+├─────────┼─────────────────────────┼────────┼───────┼────────┼─────────────────┤
+│ 13.2.3  │ Misses - Duplicated     │   0.04 │  0.00 │   2.02 │ Misses per wave │
+├─────────┼─────────────────────────┼────────┼───────┼────────┼─────────────────┤
+│ 13.2.4  │ Cache Hit               │  99.95 │ 98.64 │ 100.00 │ Pct             │
+╘═════════╧═════════════════════════╧════════╧═══════╧════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+14. Scalar L1 Data Cache
+14.1 Speed-of-Light
+╒═════════╤═══════════╤═════════╤═════════════╕
+│ Index   │ Metric    │   Value │ Unit        │
+╞═════════╪═══════════╪═════════╪═════════════╡
+│ 14.1.0  │ Bandwidth │    0.91 │ Pct of peak │
+├─────────┼───────────┼─────────┼─────────────┤
+│ 14.1.1  │ Cache Hit │   91.92 │ Pct of peak │
+╘═════════╧═══════════╧═════════╧═════════════╛
+14.2 Scalar L1D Cache Accesses
+╒═════════╤═════════════════════════╤════════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric                  │   Mean │   Min │   Max │ Unit         │
+╞═════════╪═════════════════════════╪════════╪═══════╪═══════╪══════════════╡
+│ 14.2.0  │ Req                     │   5.10 │  5.00 │  7.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.1  │ Hits                    │   4.70 │  4.43 │  6.94 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.2  │ Misses - Non Duplicated │   0.03 │  0.00 │  0.04 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.3  │ Misses- Duplicated      │   0.38 │  0.06 │  0.53 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.4  │ Cache Hit               │  91.92 │ 88.69 │ 99.13 │ Pct          │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.5  │ Read Req (Total)        │   5.10 │  5.00 │  7.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.6  │ Atomic Req              │   0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.7  │ Read Req (1 DWord)      │   1.05 │  1.00 │  2.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.8  │ Read Req (2 DWord)      │   2.05 │  2.00 │  3.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.9  │ Read Req (4 DWord)      │   1.95 │  1.00 │  2.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.10 │ Read Req (8 DWord)      │   0.05 │  0.00 │  1.00 │ Req per wave │
+├─────────┼─────────────────────────┼────────┼───────┼───────┼──────────────┤
+│ 14.2.11 │ Read Req (16 DWord)     │   0.00 │  0.00 │  0.00 │ Req per wave │
+╘═════════╧═════════════════════════╧════════╧═══════╧═══════╧══════════════╛
+14.3 Scalar L1D Cache - L2 Interface
+╒═════════╤════════════╤════════╤═══════╤═══════╤═════════════════╕
+│ Index   │ Metric     │   Mean │   Min │   Max │ Unit            │
+╞═════════╪════════════╪════════╪═══════╪═══════╪═════════════════╡
+│ 14.3.0  │ Read Req   │   0.03 │  0.00 │  0.04 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.1  │ Write Req  │   0.00 │  0.00 │  0.00 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.2  │ Atomic Req │   0.00 │  0.00 │  0.00 │ Req per wave    │
+├─────────┼────────────┼────────┼───────┼───────┼─────────────────┤
+│ 14.3.3  │ Stall      │   0.00 │  0.00 │  0.00 │ Cycles per wave │
+╘═════════╧════════════╧════════╧═══════╧═══════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+15. Texture Addresser and Texture Data (TA/TD)
+15.1 TA
+╒═════════╤════════════════════════╤═════════╤═══════╤═════════╤═════════════════╕
+│ Index   │ Metric                 │     Avg │   Min │     Max │ Unit            │
+╞═════════╪════════════════════════╪═════════╪═══════╪═════════╪═════════════════╡
+│ 15.1.0  │ TA Busy                │   72.09 │ 29.64 │   74.32 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.1  │ TC2TA Addr Stall       │   47.46 │  0.00 │   50.16 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.2  │ TC2TA Data Stall       │   12.94 │ 10.41 │   15.63 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.3  │ TD2TA Addr Stall       │    0.00 │  0.00 │    0.00 │ Pct             │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.4  │ Total Instructions     │  243.03 │  1.00 │  256.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.5  │ Flat Instr             │    0.05 │  0.00 │    1.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.6  │ Flat Read Instr        │    0.00 │  0.00 │    0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.7  │ Flat Write Instr       │    0.05 │  0.00 │    1.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.8  │ Flat Atomic Instr      │    0.00 │  0.00 │    0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.9  │ Buffer Instr           │  242.98 │  0.00 │  256.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.10 │ Buffer Read Instr      │  182.24 │  0.00 │  192.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.11 │ Buffer Write Instr     │    0.00 │  0.00 │    0.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.12 │ Buffer Atomic Instr    │   60.75 │  0.00 │   64.00 │ Instr per wave  │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.13 │ Buffer Total Cylces    │ 3887.73 │  0.00 │ 4096.00 │ Cycles per wave │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.14 │ Buffer Coalesced Read  │    0.00 │  0.00 │    0.00 │ Cycles per wave │
+├─────────┼────────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 15.1.15 │ Buffer Coalesced Write │    0.00 │  0.00 │    0.00 │ Cycles per wave │
+╘═════════╧════════════════════════╧═════════╧═══════╧═════════╧═════════════════╛
+15.2 TD
+╒═════════╤═══════════════════╤════════╤═══════╤═══════╤════════════════╕
+│ Index   │ Metric            │ Avg    │ Min   │ Max   │ Unit           │
+╞═════════╪═══════════════════╪════════╪═══════╪═══════╪════════════════╡
+│ 15.2.0  │ TD Busy           │ 77.93  │ 48.41 │ 79.27 │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼───────┼────────────────┤
+│ 15.2.1  │ TC2TD Stall       │ 68.66  │ 47.5  │ 70.42 │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼───────┼────────────────┤
+│ 15.2.2  │ SPI2TD Stall      │        │       │       │ Pct            │
+├─────────┼───────────────────┼────────┼───────┼───────┼────────────────┤
+│ 15.2.3  │ Coalescable Instr │ 0.0    │ 0.0   │ 0.0   │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼───────┼────────────────┤
+│ 15.2.4  │ Load Instr        │ 182.24 │ 0.0   │ 192.0 │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼───────┼────────────────┤
+│ 15.2.5  │ Store Instr       │ 0.05   │ 0.0   │ 1.0   │ Instr per wave │
+├─────────┼───────────────────┼────────┼───────┼───────┼────────────────┤
+│ 15.2.6  │ Atomic Instr      │ 60.75  │ 0.0   │ 64.0  │ Instr per wave │
+╘═════════╧═══════════════════╧════════╧═══════╧═══════╧════════════════╛
+
+
+--------------------------------------------------------------------------------
+16. Vector L1 Data Cache
+16.1 Speed-of-Light
+╒═════════╤═══════════════════╤═════════╤═════════════╕
+│ Index   │ Metric            │   Value │ Unit        │
+╞═════════╪═══════════════════╪═════════╪═════════════╡
+│ 16.1.0  │ Buffer Coalescing │   25.00 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.1  │ Cache Util        │   80.65 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.2  │ Cache BW          │   18.58 │ Pct of peak │
+├─────────┼───────────────────┼─────────┼─────────────┤
+│ 16.1.3  │ Cache Hit         │   34.32 │ Pct of peak │
+╘═════════╧═══════════════════╧═════════╧═════════════╛
+16.2 L1D Cache Stalls
+╒═════════╤════════════════════════╤════════╤═══════╤═══════╤════════╕
+│ Index   │ Metric                 │   Mean │   Min │   Max │ Unit   │
+╞═════════╪════════════════════════╪════════╪═══════╪═══════╪════════╡
+│ 16.2.0  │ Stalled on L2 Data     │  60.10 │ 59.23 │ 68.76 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.1  │ Stalled on L2 Req      │   9.60 │  8.66 │ 29.47 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.2  │ Tag RAM Stall (Read)   │   8.26 │  0.00 │  8.74 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.3  │ Tag RAM Stall (Write)  │   0.00 │  0.00 │  0.00 │ Pct    │
+├─────────┼────────────────────────┼────────┼───────┼───────┼────────┤
+│ 16.2.4  │ Tag RAM Stall (Atomic) │   0.00 │  0.00 │  0.00 │ Pct    │
+╘═════════╧════════════════════════╧════════╧═══════╧═══════╧════════╛
+16.3 L1D Cache Accesses
+╒═════════╤═════════════════════╤═══════════╤═══════════╤═══════════╤════════════════╕
+│ Index   │ Metric              │       Avg │       Min │       Max │ Unit           │
+╞═════════╪═════════════════════╪═══════════╪═══════════╪═══════════╪════════════════╡
+│ 16.3.0  │ Total Req           │  15554.17 │     64.00 │  16384.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.1  │ Read Req            │  11663.19 │      0.00 │  12288.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.2  │ Write Req           │      3.25 │      0.00 │     64.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.3  │ Atomic Req          │   3887.73 │      0.00 │   4096.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.4  │ Cache BW            │   2142.97 │   1500.82 │   2286.14 │ Gb/s           │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.5  │ Cache Accesses      │   6804.34 │     16.00 │   7168.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.6  │ Cache Hits          │   2278.35 │      8.00 │   2410.47 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.7  │ Cache Hit Rate      │     34.32 │     32.15 │     50.00 │ Pct            │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.8  │ Invalidate          │      0.04 │      0.00 │      0.05 │ per wave       │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.9  │ L1-L2 BW            │ 289663.60 │ 289663.60 │ 289663.60 │ Bytes per wave │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.10 │ L1-L2 Read          │   3553.65 │      0.00 │   3839.55 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.11 │ L1-L2 Write         │      0.41 │      0.00 │      8.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.12 │ L1-L2 Atomic        │    971.93 │      0.00 │   1024.00 │ Req per wave   │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.13 │ L1 Access Latency   │    509.12 │    489.27 │    747.01 │ Cycles         │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.14 │ L1-L2 Read Latency  │    179.02 │    177.98 │    187.45 │ Cycles         │
+├─────────┼─────────────────────┼───────────┼───────────┼───────────┼────────────────┤
+│ 16.3.15 │ L1-L2 Write Latency │    513.40 │    371.45 │    563.86 │ Cycles         │
+╘═════════╧═════════════════════╧═══════════╧═══════════╧═══════════╧════════════════╛
+16.4 L1D - L2 Transactions
+╒═════════╤═════════════╤════════╤═════════════╤═════════╤═══════╤═════════╤══════════════╕
+│ Index   │ Metric      │ Xfer   │ Coherency   │     Avg │   Min │     Max │ Unit         │
+╞═════════╪═════════════╪════════╪═════════════╪═════════╪═══════╪═════════╪══════════════╡
+│ 16.4.0  │ NC - Read   │ Read   │ NC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.1  │ UC - Read   │ Read   │ UC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.2  │ CC - Read   │ Read   │ CC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.3  │ RW - Read   │ Read   │ RW          │ 3550.99 │  0.00 │ 3866.47 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.4  │ RW - Write  │ Write  │ RW          │    2.27 │  0.00 │  109.86 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.5  │ NC - Write  │ Write  │ NC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.6  │ UC - Write  │ Write  │ UC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.7  │ CC - Write  │ Write  │ CC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.8  │ NC - Atomic │ Atomic │ NC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.9  │ UC - Atomic │ Atomic │ UC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.10 │ CC - Atomic │ Atomic │ CC          │    0.00 │  0.00 │    0.00 │ Req per wave │
+├─────────┼─────────────┼────────┼─────────────┼─────────┼───────┼─────────┼──────────────┤
+│ 16.4.11 │ RW - Atomic │ Atomic │ RW          │  971.93 │  0.00 │ 1024.00 │ Req per wave │
+╘═════════╧═════════════╧════════╧═════════════╧═════════╧═══════╧═════════╧══════════════╛
+16.5 L1D Addr Translation
+╒═════════╤══════════════════════╤═════════╤═══════╤═════════╤═════════════════╕
+│ Index   │ Metric               │    Mean │   Min │     Max │ Units           │
+╞═════════╪══════════════════════╪═════════╪═══════╪═════════╪═════════════════╡
+│ 16.5.0  │ Req                  │ 6804.34 │ 16.00 │ 7168.00 │ Req per wave    │
+├─────────┼──────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 16.5.1  │ Hit Ratio            │   99.82 │ 98.01 │  100.00 │ Pct             │
+├─────────┼──────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 16.5.2  │ Hits                 │ 6796.21 │ 15.68 │ 7161.15 │ Hits per wave   │
+├─────────┼──────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 16.5.3  │ Misses (Translation) │    7.19 │  0.00 │    8.03 │ Misses per wave │
+├─────────┼──────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 16.5.4  │ Misses (Permission)  │    0.00 │  0.00 │    0.00 │ Misses per wave │
+╘═════════╧══════════════════════╧═════════╧═══════╧═════════╧═════════════════╛
+
+
+--------------------------------------------------------------------------------
+17. L2 Cache
+17.1 Speed-of-Light
+╒═════════╤═════════════╤═════════╤════════╕
+│ Index   │ Metric      │   Value │ Unit   │
+╞═════════╪═════════════╪═════════╪════════╡
+│ 17.1.0  │ L2 Util     │   96.67 │ Pct    │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.1  │ Cache Hit   │   83.14 │ Pct    │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.2  │ L2-EA Rd BW │  169.08 │ Gb/s   │
+├─────────┼─────────────┼─────────┼────────┤
+│ 17.1.3  │ L2-EA Wr BW │  116.59 │ Gb/s   │
+╘═════════╧═════════════╧═════════╧════════╛
+17.2 L2 - Fabric Transactions
+╒═════════╤══════════════════════╤══════════╤════════╤══════════╤════════════════╕
+│ Index   │ Metric               │ Avg      │ Min    │ Max      │ Unit           │
+╞═════════╪══════════════════════╪══════════╪════════╪══════════╪════════════════╡
+│ 17.2.0  │ Read BW              │ 35905.49 │ 0.0    │ 38009.03 │ Bytes per wave │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.1  │ Write BW             │ 15551.11 │ 435.22 │ 16409.29 │ Bytes per wave │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.2  │ Read (32B)           │ 0.0      │ 0.0    │ 0.0      │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.3  │ Read (Uncached 32B)  │ 0.25     │ 0.0    │ 0.38     │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.4  │ Read (64B)           │ 561.02   │ 0.0    │ 593.89   │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.5  │ HBM Read             │ 563.95   │ 0.0    │ 714.29   │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.6  │ Write (32B)          │ 0.0      │ 0.0    │ 0.0      │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.7  │ Write (Uncached 32B) │ 0.0      │ 0.0    │ 0.0      │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.8  │ Write (64B)          │ 242.99   │ 6.8    │ 256.4    │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.9  │ HBM Write            │ 244.84   │ 6.8    │ 367.83   │ Req per wave   │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.10 │ Read Latency         │ 304.86   │ 268.13 │ 1243.33  │ Cycles         │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.11 │ Write Latency        │ 239.12   │ 232.32 │ 361.95   │ Cycles         │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.12 │ Atomic Latency       │          │        │          │ Cycles         │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.13 │ Read Stall           │ 0.0      │ 0.0    │ 0.0      │ Pct            │
+├─────────┼──────────────────────┼──────────┼────────┼──────────┼────────────────┤
+│ 17.2.14 │ Write Stall          │ 0.24     │ 0.0    │ 5.4      │ Pct            │
+╘═════════╧══════════════════════╧══════════╧════════╧══════════╧════════════════╛
+17.3 L2 Cache Accesses
+╒═════════╤════════════════════╤═════════╤═══════╤═════════╤═════════════════╕
+│ Index   │ Metric             │     Avg │   Min │     Max │ Unit            │
+╞═════════╪════════════════════╪═════════╪═══════╪═════════╪═════════════════╡
+│ 17.3.0  │ Req                │ 4522.01 │  8.00 │ 4773.01 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.1  │ Streaming Req      │    0.00 │  0.00 │    0.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.2  │ Read Req           │ 3550.94 │  0.00 │ 3746.34 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.3  │ Write Req          │    0.41 │  0.00 │    8.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.4  │ Atomic Req         │  971.93 │  0.00 │ 1024.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.5  │ Probe Req          │    0.00 │  0.00 │    0.02 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.6  │ Hits               │ 3960.64 │  0.00 │ 4180.68 │ Hits per wave   │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.7  │ Misses             │  561.37 │  8.00 │  593.25 │ Misses per wave │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.8  │ Cache Hit          │   83.14 │  0.03 │   87.69 │ Pct             │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.9  │ Writeback          │  242.96 │  6.80 │  256.24 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.10 │ NC Req             │    0.03 │  0.00 │    0.04 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.11 │ UC Req             │    0.13 │  0.00 │    0.20 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.12 │ CC Req             │    0.00 │  0.00 │    0.00 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.13 │ RW Req             │ 4522.25 │  8.00 │ 4770.09 │ Req per wave    │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.14 │ Writeback (Normal) │  243.01 │  6.80 │  311.41 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.15 │ Writeback (TC Req) │    1.15 │  0.00 │   20.14 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.16 │ Evict (Normal)     │  539.14 │  5.87 │  660.63 │ per wave        │
+├─────────┼────────────────────┼─────────┼───────┼─────────┼─────────────────┤
+│ 17.3.17 │ Evict (TC Req)     │    0.00 │  0.00 │    0.00 │ per wave        │
+╘═════════╧════════════════════╧═════════╧═══════╧═════════╧═════════════════╛
+17.4 L2 - EA Interface Stalls
+╒═════════╤═════════════════════════════╤═════════════════════╤═══════════════╤═══════╤═══════╤═══════╤══════════════╕
+│ Index   │ Metric                      │ Type                │ Transaction   │   Avg │   Min │   Max │ Unit         │
+╞═════════╪═════════════════════════════╪═════════════════════╪═══════════════╪═══════╪═══════╪═══════╪══════════════╡
+│ 17.4.0  │ Read - Remote Socket Stall  │ Remote Socket Stall │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.1  │ Read - Peer GCD Stall       │ Peer GCD Stall      │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.2  │ Read - HBM Stall            │ HBM Stall           │ Read          │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.3  │ Write - Remote Socket Stall │ Remote Socket Stall │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.4  │ Write - Peer GCD Stall      │ Peer GCD Stall      │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.5  │ Write - HBM Stall           │ HBM Stall           │ Write         │  1.52 │  0.00 │ 29.64 │ Req per wave │
+├─────────┼─────────────────────────────┼─────────────────────┼───────────────┼───────┼───────┼───────┼──────────────┤
+│ 17.4.6  │ Write - Credit Starvation   │ Credit Starvation   │ Write         │  0.00 │  0.00 │  0.00 │ Req per wave │
+╘═════════╧═════════════════════════════╧═════════════════════╧═══════════════╧═══════╧═══════╧═══════╧══════════════╛
+
+
+--------------------------------------------------------------------------------
+18. L2 Cache (per Channel)
+18.1 Aggregate Stats (All 32 channels)
+╒═════════╤════════════════════════════╤════════╤═══════════╤════════╤═════════╤═════════════════╕
+│ Index   │ Metric                     │ Mean   │ Std Dev   │ Min    │ Max     │ Units           │
+╞═════════╪════════════════════════════╪════════╪═══════════╪════════╪═════════╪═════════════════╡
+│ 18.1.0  │ L2 Cache Hit Rate          │ 83.14  │ 19.39     │ 0.03   │ 87.7    │ Pct             │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.1  │ Req                        │ 141.28 │ 32.92     │ 0.25   │ 149.09  │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.2  │ L1 - L2 Read Req           │ 110.89 │ 25.89     │ 0.0    │ 117.09  │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.3  │ L1 - L2 Write Req          │ 0.01   │ 0.06      │ 0.0    │ 0.25    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.4  │ L1 - L2 Atomic Req         │ 30.37  │ 7.09      │ 0.0    │ 32.0    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.5  │ L2 - EA Read Req           │ 17.54  │ 4.09      │ 0.0    │ 18.53   │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.6  │ L2 - EA Write Req          │ 7.59   │ 1.72      │ 0.21   │ 8.01    │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.7  │ L2 - EA Atomic Req         │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Req per wave    │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.8  │ L2 - EA Read Lat           │ 296.73 │ 157.15    │ 257.42 │ 1241.67 │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.9  │ L2 - EA Write Lat          │ 235.3  │ 6.65      │ 231.47 │ 275.41  │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.10 │ L2 - EA Atomic Lat         │        │           │        │         │ Cycles          │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.11 │ L2 - EA Read Stall (IO)    │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.12 │ L2 - EA Read Stall (GMI)   │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.13 │ L2 - EA Read Stall (DRAM)  │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.14 │ L2 - EA Write Stall (IO)   │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.15 │ L2 - EA Write Stall (GMI)  │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.16 │ L2 - EA Write Stall (DRAM) │ 0.0    │ 0.02      │ 0.0    │ 0.11    │ Cycles per wave │
+├─────────┼────────────────────────────┼────────┼───────────┼────────┼─────────┼─────────────────┤
+│ 18.1.17 │ L2 - EA Write Starve       │ 0.0    │ 0.0       │ 0.0    │ 0.0     │ Cycles per wave │
+╘═════════╧════════════════════════════╧════════╧═══════════╧════════╧═════════╧═════════════════╛
+18.2 Channel 0-15
+╒═════════════════════════════════════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕
+│                                         │ 18.2.0   │ 18.2.1   │ 18.2.2   │ 18.2.3   │ 18.2.4   │ 18.2.5   │ 18.2.6   │ 18.2.7   │ 18.2.8   │ 18.2.9   │ 18.2.10   │ 18.2.11   │ 18.2.12   │ 18.2.13   │ 18.2.14   │ 18.2.15   │
+╞═════════════════════════════════════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
+│ Channel                                 │ 0.0      │ 1.0      │ 2.0      │ 3.0      │ 4.0      │ 5.0      │ 6.0      │ 7.0      │ 8.0      │ 9.0      │ 10.0      │ 11.0      │ 12.0      │ 13.0      │ 14.0      │ 15.0      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2 Cache Hit Rate (%)                   │ 83.17    │ 83.14    │ 83.14    │ 83.14    │ 83.14    │ 83.14    │ 83.1     │ 83.14    │ 83.14    │ 83.15    │ 83.11     │ 83.13     │ 83.17     │ 83.14     │ 83.14     │ 83.15     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ Requests (Requests)                     │ 141.29   │ 141.26   │ 141.29   │ 141.26   │ 141.29   │ 141.26   │ 141.35   │ 141.26   │ 141.29   │ 141.26   │ 141.33    │ 141.26    │ 141.29    │ 141.26    │ 141.29    │ 141.26    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Read (Requests)                   │ 110.9    │ 110.88   │ 110.9    │ 110.88   │ 110.9    │ 110.88   │ 110.96   │ 110.88   │ 110.9    │ 110.88   │ 110.95    │ 110.88    │ 110.9     │ 110.88    │ 110.9     │ 110.88    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Write (Requests)                  │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Atomic (Requests)                 │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37     │ 30.37     │ 30.37     │ 30.37     │ 30.37     │ 30.37     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read (Requests)                   │ 17.55    │ 17.53    │ 17.54    │ 17.53    │ 17.58    │ 17.53    │ 17.57    │ 17.53    │ 17.58    │ 17.52    │ 17.55     │ 17.54     │ 17.53     │ 17.53     │ 17.53     │ 17.52     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write (Requests)                  │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59      │ 7.59      │ 7.59      │ 7.59      │ 7.59      │ 7.59      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Latency (Cycles)             │ 261.32   │ 261.04   │ 260.46   │ 261.37   │ 261.22   │ 262.13   │ 267.31   │ 272.13   │ 251.0    │ 251.59   │ 250.18    │ 259.17    │ 257.31    │ 262.14    │ 267.78    │ 262.89    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Latency (Cycles)            │ 236.06   │ 233.91   │ 233.04   │ 232.77   │ 236.51   │ 233.63   │ 240.1    │ 244.21   │ 233.97   │ 234.04   │ 233.08    │ 233.26    │ 234.03    │ 234.06    │ 238.23    │ 243.73    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic Latency (Cycles)           │          │          │          │          │          │          │          │          │          │          │           │           │           │           │           │           │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - IO (Cycles per)      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - GMI (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - DRAM (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - IO (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - GMI (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - DRAM (Cycles per)   │ 0.01     │ 0.0      │ 0.0      │ 0.0      │ 0.01     │ 0.0      │ 0.0      │ 0.0      │ 0.01     │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - Starve (Cycles per) │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+╘═════════════════════════════════════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛
+18.3 Channel 16-31
+╒═════════════════════════════════════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╤═══════════╕
+│                                         │ 18.3.0   │ 18.3.1   │ 18.3.2   │ 18.3.3   │ 18.3.4   │ 18.3.5   │ 18.3.6   │ 18.3.7   │ 18.3.8   │ 18.3.9   │ 18.3.10   │ 18.3.11   │ 18.3.12   │ 18.3.13   │ 18.3.14   │ 18.3.15   │
+╞═════════════════════════════════════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
+│ Channel                                 │ 16.0     │ 17.0     │ 18.0     │ 19.0     │ 20.0     │ 21.0     │ 22.0     │ 23.0     │ 24.0     │ 25.0     │ 26.0      │ 27.0      │ 28.0      │ 29.0      │ 30.0      │ 31.0      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2 Cache Hit Rate (%)                   │ 83.15    │ 83.14    │ 83.15    │ 83.15    │ 83.14    │ 83.15    │ 83.13    │ 83.15    │ 83.14    │ 83.18    │ 83.14     │ 83.14     │ 83.14     │ 83.14     │ 83.14     │ 83.14     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ Requests (Requests)                     │ 141.29   │ 141.26   │ 141.29   │ 141.26   │ 141.29   │ 141.26   │ 141.29   │ 141.26   │ 141.3    │ 141.26   │ 141.29    │ 141.26    │ 141.29    │ 141.26    │ 141.29    │ 141.26    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Read (Requests)                   │ 110.9    │ 110.88   │ 110.9    │ 110.88   │ 110.9    │ 110.88   │ 110.9    │ 110.88   │ 110.91   │ 110.88   │ 110.9     │ 110.88    │ 110.9     │ 110.88    │ 110.9     │ 110.88    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Write (Requests)                  │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01     │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │ 0.01      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L1-L2 Atomic (Requests)                 │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37    │ 30.37     │ 30.37     │ 30.37     │ 30.37     │ 30.37     │ 30.37     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read (Requests)                   │ 17.55    │ 17.54    │ 17.53    │ 17.52    │ 17.54    │ 17.53    │ 17.54    │ 17.52    │ 17.53    │ 17.51    │ 17.55     │ 17.53     │ 17.55     │ 17.53     │ 17.54     │ 17.53     │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write (Requests)                  │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59     │ 7.59      │ 7.59      │ 7.59      │ 7.59      │ 7.59      │ 7.59      │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic (Requests)                 │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Latency (Cycles)             │ 248.89   │ 257.92   │ 259.0    │ 254.8    │ 252.78   │ 261.85   │ 266.16   │ 272.19   │ 257.72   │ 258.11   │ 258.36    │ 259.31    │ 261.7     │ 261.89    │ 266.52    │ 275.16    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Latency (Cycles)            │ 230.8    │ 231.7    │ 229.37   │ 230.2    │ 233.53   │ 237.03   │ 241.57   │ 245.82   │ 230.2    │ 229.19   │ 231.21    │ 230.06    │ 234.45    │ 236.19    │ 238.98    │ 244.73    │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Atomic Latency (Cycles)           │          │          │          │          │          │          │          │          │          │          │           │           │           │           │           │           │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - IO (Cycles per)      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - GMI (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Read Stall - DRAM (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - IO (Cycles per)     │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - GMI (Cycles per)    │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - DRAM (Cycles per)   │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.01     │ 0.01     │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+├─────────────────────────────────────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼───────────┼───────────┼───────────┼───────────┼───────────┼───────────┤
+│ L2-EA Write Stall - Starve (Cycles per) │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0      │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │ 0.0       │
+╘═════════════════════════════════════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╧═══════════╛
+
diff --git a/results/analyse_k8.txt b/results/analyse_k8.txt
new file mode 100644
index 00000000000..a4e60f5bd67
--- /dev/null
+++ b/results/analyse_k8.txt
@@ -0,0 +1 @@
+/var/spool/slurmd/job45676/slurm_script: line 13: omniperf: command not found
diff --git a/results/example_splitK_gemm_xdl_fp16 b/results/example_splitK_gemm_xdl_fp16
new file mode 100755
index 00000000000..5365855cabe
Binary files /dev/null and b/results/example_splitK_gemm_xdl_fp16 differ
diff --git a/results/pmc_perf_4.txt b/results/pmc_perf_4.txt
new file mode 100644
index 00000000000..91283cd1b90
--- /dev/null
+++ b/results/pmc_perf_4.txt
@@ -0,0 +1,25 @@
+usage: omniperf [mode] [options]
+
+Command line interface for AMD's GPU profiler, Omniperf
+
+Modes:
+  {profile,database,analyze}  Select mode of interaction with the target application:
+    profile                   Profile the target application
+    database                  Interact with Omniperf database
+    analyze                   Analyze existing profiling results at command line
+
+Help:
+  -h, --help                  show this help message and exit
+
+General Options:
+  -v, --version               show program's version number and exit
+usage: omniperf [mode] [options]
+tool: error: Profling Error: Cannot find pmc_perf.csv in /work1/sadasivan/student51/assignments/finalProject/composable_kernel/build/bin/workloads/bin/mi100/perfmon
+
+--------
+Analyze
+--------
+
+
+
+
diff --git a/results/submit_jobs.sh b/results/submit_jobs.sh
new file mode 100644
index 00000000000..faacaf119cb
--- /dev/null
+++ b/results/submit_jobs.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --job-name=example_job
+#SBATCH --mail-type=BEGIN,END
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --time=5:00
+#SBATCH --account=sadasivan
+#SBATCH --partition=devel 
+# or use --partition=devel / mi1004x if required
+ 
+# Run your code (run "module load omniperf" first, otherwise omniperf is not found). Example given below.
+# omniperf profile -n bin -- ./example_splitK_gemm_xdl_fp16 1 2 1 1 3840 4096 4096 4096 4096 4096
+omniperf analyze -p workloads/bin/mi100/ &> analyse_k1.txt