kernel-competition-bot · spirit13579 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,12 @@
+# MLU 编译产物
+*.o
+*.so
+*.wrapper.cpp
+
+# Python
+__pycache__/
+*.pyc
+
+.vscode/
+AGENTS.md
+
diff --git a/111_Masked_select.mlu b/111_Masked_select.mlu
@@ -0,0 +1,48 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <framework/core/MLUStream.h>
+#include <cnrt.h>
+
+// Stream-compacts `input`: each of the `total` halves whose float value is
+// strictly greater than `threshold` is appended, in input order, to `output`.
+// The scan is one sequential loop and does not partition work by taskId.
+__mlu_entry__ void masked_select_kernel(const half *input, half *output,
+                                        int total, float threshold)
+{
+    const half *src = input;
+    half *dst = output;  // next free slot of the compacted output
+    for (int remaining = total; remaining > 0; --remaining, ++src) {
+        const half candidate = *src;
+        if (threshold < static_cast<float>(candidate)) {
+            *dst++ = candidate;
+        }
+    }
+}
+
+// Host entry point: returns a 1-D float16 tensor holding, in row-major order,
+// the elements of the contiguous 2-D `input` that are strictly greater than
+// `threshold`. Fails via TORCH_CHECK on a bad layout, rank, dtype or size.
+torch::Tensor bang_func(torch::Tensor input, double threshold)
+{
+    TORCH_CHECK(input.is_contiguous(), "Input tensor must be contiguous");
+    TORCH_CHECK(input.dim() == 2, "Input tensor must have shape [M, N]");
+    TORCH_CHECK(input.scalar_type() == torch::kHalf, "111_Masked_select expects float16 input");
+    // The kernel indexes with `int`; reject element counts that do not round-trip.
+    const int total = static_cast<int>(input.numel());
+    TORCH_CHECK(static_cast<int64_t>(total) == input.numel(), "Input tensor has too many elements");
+
+    // Count with the same float-vs-float comparison the kernel performs, so the
+    // kernel can never write more elements than the output was allocated for.
+    const float limit = static_cast<float>(threshold);
+    const int64_t output_size = (input.to(torch::kFloat) > limit).sum().item<int64_t>();
+    auto output = torch::empty({output_size}, input.options());
+    if (output_size == 0) {
+        return output;
+    }
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t dim = {1, 1, 1};  // the kernel is a sequential scan: launch one task
+    masked_select_kernel<<<dim, cnrtFuncTypeBlock, queue>>>(
+        reinterpret_cast<const half *>(input.data_ptr<at::Half>()),
+        reinterpret_cast<half *>(output.data_ptr<at::Half>()), total, limit);
+    return output;
+}
diff --git a/138_GRU_forward.mlu b/138_GRU_forward.mlu
@@ -0,0 +1,71 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <framework/core/MLUStream.h>
+#include <cnrt.h>
+
+// Compacts `input` into `output`: each of the `total` floats that compares
+// strictly greater than `threshold` is appended to `output` in input order.
+// Elements that fail the comparison (NaN included) are skipped.
+__mlu_entry__ void threshold_select_kernel(const float *input, float *output,
+                                           int total, float threshold)
+{
+    int kept = 0;  // number of elements written so far
+    int pos = 0;
+    while (pos < total) {
+        const float candidate = input[pos++];
+        if (threshold < candidate) {
+            output[kept++] = candidate;
+        }
+    }
+}
+
+// float16 variant of the compaction: each of the `total` halves is widened to
+// float for the comparison, and those strictly greater than `threshold` are
+// appended unchanged to `output` in input order.
+__mlu_entry__ void threshold_select_half_kernel(const half *input, half *output,
+                                                int total, float threshold)
+{
+    const half *src = input;
+    half *dst = output;  // next free slot of the compacted output
+    for (int remaining = total; remaining > 0; --remaining, ++src) {
+        const half candidate = *src;
+        if (threshold < static_cast<float>(candidate)) {
+            *dst++ = candidate;
+        }
+    }
+}
+
+// Host entry point: returns a 1-D tensor (same dtype as `input`) holding, in
+// row-major order, the elements of the contiguous 2-D `input` that are strictly
+// greater than `threshold`. Only float32 and float16 inputs are accepted.
+torch::Tensor bang_func(torch::Tensor input, double threshold)
+{
+    TORCH_CHECK(input.is_contiguous(), "Input tensor must be contiguous");
+    TORCH_CHECK(input.dim() == 2, "Input tensor must have shape [M, N]");
+    const bool is_half = input.scalar_type() == torch::kHalf;
+    TORCH_CHECK(is_half || input.scalar_type() == torch::kFloat,
+                "Input tensor must be float32 or float16");
+    // The kernels index with `int`; reject element counts that do not round-trip.
+    const int total = static_cast<int>(input.numel());
+    TORCH_CHECK(static_cast<int64_t>(total) == input.numel(), "Input tensor has too many elements");
+
+    // Count with the same float-vs-float comparison the kernels perform, so a
+    // kernel can never write more elements than the output was allocated for.
+    const float limit = static_cast<float>(threshold);
+    const int64_t output_size = (input.to(torch::kFloat) > limit).sum().item<int64_t>();
+    auto output = torch::empty({output_size}, input.options());
+    if (output_size == 0) {
+        return output;
+    }
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t dim = {1, 1, 1};  // both kernels are sequential scans: launch one task
+    if (is_half) {
+        threshold_select_half_kernel<<<dim, cnrtFuncTypeBlock, queue>>>(
+            reinterpret_cast<const half *>(input.data_ptr<at::Half>()),
+            reinterpret_cast<half *>(output.data_ptr<at::Half>()), total, limit);
+    } else {
+        threshold_select_kernel<<<dim, cnrtFuncTypeBlock, queue>>>(
+            input.data_ptr<float>(), output.data_ptr<float>(), total, limit);
+    }
+    return output;
+}
diff --git a/Adaptive_Max_Pool_2D.mlu b/Adaptive_Max_Pool_2D.mlu
@@ -0,0 +1,115 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <framework/core/MLUStream.h>
+#include <cnrt.h>
+#define CHUNK_SIZE 4096
+
+// Adaptive 2-D max pooling over a contiguous NCHW float tensor.
+// The `total` output elements are split into contiguous ranges, one per task
+// (taskId / taskDim); the first `total % taskDim` tasks take one extra element.
+// Output cell (oh, ow) reduces input rows [floor(oh*H/out_h), ceil((oh+1)*H/out_h))
+// and the analogous column range. NaN inputs propagate to the output, matching
+// torch.nn.functional.adaptive_max_pool2d. `batch` is unused: the batch index
+// falls out of the flat output index.
+__mlu_entry__ void adaptive_max_pool2d_kernel(
+    float *input, float *output,
+    int batch, int channels, int H, int W,
+    int out_h, int out_w, int total) {
+    const uint32_t core_id = taskId;
+    const uint32_t core_num = taskDim;
+    const uint32_t per_core = total / core_num;
+    const uint32_t remainder = total % core_num;
+    const uint32_t start = core_id * per_core + (core_id < remainder ? core_id : remainder);
+    const uint32_t count = per_core + (core_id < remainder ? 1 : 0);
+
+    // Staging buffer: window rows are copied here CHUNK_SIZE floats at a time.
+    __nram__ float nram_buf[CHUNK_SIZE];
+
+    for (uint32_t idx = start; idx < start + count; ++idx) {
+        // Decompose the flat output index into (b, c, oh, ow).
+        int tmp = idx;
+        const int ow = tmp % out_w;
+        tmp /= out_w;
+        const int oh = tmp % out_h;
+        tmp /= out_h;
+        const int c = tmp % channels;
+        const int b = tmp / channels;
+
+        // Pooling window: floor for the start, ceil (clamped) for the end.
+        const int h_start = (oh * H) / out_h;
+        int h_end = ((oh + 1) * H + out_h - 1) / out_h;
+        const int w_start = (ow * W) / out_w;
+        int w_end = ((ow + 1) * W + out_w - 1) / out_w;
+        if (h_end > H) h_end = H;
+        if (w_end > W) w_end = W;
+        const int row_len = w_end - w_start;
+
+        // `first` seeds the maximum from the first element seen; the 0 initialiser
+        // only survives (and is written out) when the window is empty.
+        float max_val = 0.0f;
+        bool first = true;
+        for (int h = h_start; h < h_end; ++h) {
+            const int row_offset = ((b * channels + c) * H + h) * W + w_start;
+            for (int w_offset = 0; w_offset < row_len; w_offset += CHUNK_SIZE) {
+                const int left = row_len - w_offset;
+                const int chunk_len = left < CHUNK_SIZE ? left : CHUNK_SIZE;
+                __memcpy(nram_buf, input + row_offset + w_offset,
+                         chunk_len * sizeof(float), GDRAM2NRAM);
+                for (int i = 0; i < chunk_len; ++i) {
+                    const float v = nram_buf[i];
+                    // `v != v` holds only for NaN. Accepting it makes NaN sticky:
+                    // no later `v > max_val` can displace a NaN maximum.
+                    if (first || v > max_val || v != v) {
+                        max_val = v;
+                        first = false;
+                    }
+                }
+            }
+        }
+        output[idx] = max_val;
+    }
+}
+
+// Entry point; the harness requires the name `bang_func` and passes the output
+// size as a vector {out_h, out_w}. Takes a contiguous 4-D NCHW tensor and returns
+// a {N, C, out_h, out_w} tensor of the input dtype; non-float32 inputs are
+// computed in float32 and converted back.
+torch::Tensor bang_func(
+    torch::Tensor input,
+    std::vector<int64_t> output_size) {
+
+    TORCH_CHECK(input.is_contiguous(), "Input must be contiguous");
+    TORCH_CHECK(input.dim() == 4, "Input must have shape [N, C, H, W]");
+    TORCH_CHECK(output_size.size() == 2, "output_size must have 2 elements");
+    const int64_t out_h = output_size[0];
+    const int64_t out_w = output_size[1];
+    // The kernel divides by out_h/out_w and assumes every pooling window is non-empty.
+    TORCH_CHECK(out_h > 0 && out_w > 0, "output_size elements must be positive");
+    TORCH_CHECK(input.size(2) > 0 && input.size(3) > 0,
+                "Input spatial dimensions must be non-empty");
+
+    const auto original_dtype = input.scalar_type();
+    const torch::Tensor input_fp32 =
+        original_dtype == torch::kFloat ? input : input.to(torch::kFloat);
+    auto output_fp32 = torch::empty({input.size(0), input.size(1), out_h, out_w},
+                                    input_fp32.options());
+
+    // The kernel indexes with `int`; reject sizes that do not round-trip.
+    const int total = static_cast<int>(output_fp32.numel());
+    TORCH_CHECK(static_cast<int64_t>(static_cast<int>(input.numel())) == input.numel() &&
+                static_cast<int64_t>(total) == output_fp32.numel(),
+                "Tensor is too large for the kernel's int indexing");
+
+    if (total > 0) {
+        // Launch on PyTorch's current MLU queue so the kernel is ordered after the
+        // dtype conversion above and before any consumer of the result.
+        cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+        cnrtDim3_t dim = {4, 1, 1};
+        adaptive_max_pool2d_kernel<<<dim, cnrtFuncTypeUnion1, queue>>>(
+            input_fp32.data_ptr<float>(), output_fp32.data_ptr<float>(),
+            static_cast<int>(input.size(0)), static_cast<int>(input.size(1)),
+            static_cast<int>(input.size(2)), static_cast<int>(input.size(3)),
+            static_cast<int>(out_h), static_cast<int>(out_w), total);
+    }
+    return original_dtype == torch::kFloat ? output_fp32 : output_fp32.to(original_dtype);
+}
diff --git a/Argmax_over_a_dimension.mlu b/Argmax_over_a_dimension.mlu
@@ -0,0 +1,88 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <cnrt.h>
+#include <limits>
+
+#define BLOCK_SIZE 1024  // 与原示例保持一致，此处未实际使用
+
+/**
+ * @brief Kernel: argmax of a contiguous half tensor along one dimension.
+ * The input is viewed as [pre, dim_size, post]. Output element idx (row-major over
+ * [pre, post]) receives the position along the middle axis of the largest value;
+ * ties resolve to the first occurrence and NaN counts as the maximum, matching
+ * torch.argmax. Outputs are dealt round-robin to tasks via taskId / taskDim.
+ * @param input        input tensor data (half)
+ * @param output       output index data (int64_t)
+ * @param pre          product of the sizes of the dimensions before dim
+ * @param dim_size     length of the reduced dimension
+ * @param post         product of the sizes of the dimensions after dim
+ * @param total_output number of output elements
+ */
+__mlu_entry__ void argmax_kernel(half *input, int64_t *output,
+                                 int pre, int dim_size, int post,
+                                 int total_output) {
+    const uint32_t task_id = taskId;
+    const uint32_t task_num = taskDim;
+    for (int idx = task_id; idx < total_output; idx += task_num) {
+        // Offset of element 0 of the reduced vector; its elements are `post` apart.
+        const int base = (idx / post) * dim_size * post + (idx % post);
+        float max_val = -std::numeric_limits<float>::infinity();
+        int max_idx = 0;
+        for (int k = 0; k < dim_size; ++k) {
+            const float cur = __half2float(input[base + k * post]);
+            // Strictly-greater keeps the first of equal maxima. The second clause
+            // lets the first NaN win and, since max_val is then NaN, stay the winner.
+            if (cur > max_val || (cur != cur && max_val == max_val)) {
+                max_val = cur;
+                max_idx = k;
+            }
+        }
+        output[idx] = max_idx;
+    }
+}
+
+/**
+ * @brief PyTorch entry point; symbol name and parameter types are fixed by the test harness.
+ * @param x   input tensor: torch::kFloat16, contiguous
+ * @param dim dimension to reduce (int, not int64_t); negative values count from the end
+ * @return    torch::kInt64 index tensor shaped like x with dimension dim removed
+ * NOTE(review): torch_mlu::getCurMLUStream() is used below, but <framework/core/MLUStream.h>
+ * is not among this file's includes - confirm the build provides the declaration.
+ */
+torch::Tensor bang_func(torch::Tensor x, int dim) {
+    TORCH_CHECK(x.is_contiguous(), "Input must be contiguous");
+    TORCH_CHECK(x.scalar_type() == torch::kFloat16, "Input must be float16");
+    const int64_t ndim = x.dim();
+    TORCH_CHECK(dim >= -ndim && dim < ndim, "Dimension out of range");
+    const int64_t axis = dim < 0 ? dim + ndim : dim;
+    const int64_t dim_size = x.size(axis);
+    TORCH_CHECK(dim_size > 0, "Cannot take argmax over an empty dimension");
+    // The kernel does all of its index arithmetic in `int`.
+    TORCH_CHECK(x.numel() <= std::numeric_limits<int>::max(), "Input has too many elements");
+
+    // View x as [pre, dim_size, post]; the output keeps every extent except dim_size.
+    int64_t pre = 1, post = 1;
+    std::vector<int64_t> out_shape;
+    for (int64_t i = 0; i < ndim; ++i) {
+        if (i == axis) continue;
+        int64_t &extent = (i < axis) ? pre : post;
+        extent *= x.size(i);
+        out_shape.push_back(x.size(i));
+    }
+    const int64_t total_output = pre * post;
+
+    auto out_opts = torch::TensorOptions().dtype(torch::kInt64).device(x.device());
+    torch::Tensor output = torch::empty(out_shape, out_opts);
+    if (total_output == 0) {
+        return output;  // empty result: nothing to launch
+    }
+    // Fixed task count; the kernel strides each task over the outputs it owns.
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t dim3 = {16, 1, 1};
+    argmax_kernel<<<dim3, cnrtFuncTypeUnion1, queue>>>(
+        reinterpret_cast<half*>(x.data_ptr<at::Half>()),
+        output.data_ptr<int64_t>(),
+        static_cast<int>(pre), static_cast<int>(dim_size),
+        static_cast<int>(post), static_cast<int>(total_output));
+    return output;
+}