kernel-competition-bot · junhaoguo809-crypto · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/Argmax_over_a_dimension.mlu b/Argmax_over_a_dimension.mlu
@@ -0,0 +1,52 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <cnrt.h>
+
+#define NRAM_SIZE 4096
+
+__mlu_entry__ void argmax_dim1_kernel(float *x, long *out, int rows, int cols) {
+    __nram__ float buf[NRAM_SIZE];
+    for (int row = taskId; row < rows; row += taskDim) {
+        float *src = x + row * cols;
+        int best = 0;
+        float best_val = -3.402823466e+38F;
+        for (int done = 0; done < cols; done += NRAM_SIZE) {
+            int len = (cols - done) < NRAM_SIZE ? (cols - done) : NRAM_SIZE;
+            __memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
+            for (int i = 0; i < len; ++i) {
+                if (buf[i] > best_val) { best_val = buf[i]; best = done + i; }
+            }
+        }
+        out[row] = (long)best;
+    }
+}
+
+__mlu_entry__ void argmax_dim0_kernel(float *x, long *out, int rows, int cols) {
+    for (int col = taskId; col < cols; col += taskDim) {
+        int best = 0;
+        float best_val = -3.402823466e+38F;
+        for (int r = 0; r < rows; ++r) {
+            float v = x[r * cols + col];
+            if (v > best_val) { best_val = v; best = r; }
+        }
+        out[col] = (long)best;
+    }
+}
+
+torch::Tensor bang_func(torch::Tensor x, int dim) {
+    auto input = x.contiguous().to(torch::kFloat).contiguous();
+    int rows = input.size(0), cols = input.size(1);
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t d = {32, 1, 1};
+    cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
+    if (dim == 1 || dim == -1) {
+        auto out = torch::empty({rows}, x.options().dtype(torch::kLong));
+        argmax_dim1_kernel<<<d, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<long>(), rows, cols);
+        cnrtQueueSync(queue);
+        return out;
+    }
+    auto out = torch::empty({cols}, x.options().dtype(torch::kLong));
+    argmax_dim0_kernel<<<d, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<long>(), rows, cols);
+    cnrtQueueSync(queue);
+    return out;
+}
diff --git a/BangC 算子比赛开发规范与说明.md b/BangC 算子比赛开发规范与说明.md
@@ -0,0 +1,43 @@
+**BangC 算子比赛开发规范与说明**
+
+为保证比赛公平性，并确保参赛同学能够真正学习 BangC 语言及算子开发流程，现对比赛中允许和禁止使用的内容作如下说明。所有参赛作品均须遵守本声明；如有违反，主办方有权视情节取消成绩。
+
+**一、比赛目标**
+
+本次比赛旨在鼓励参赛者基于 BangC 语言，独立完成算子的实现、调试与优化，理解算子开发的基本过程，而非通过直接调用现成高层计算库或其他规避方式完成题目。
+
+**二、允许使用的内容**
+
+1. 可以使用与程序组织、数据准备、张量构造、输入输出、内存申请、初始化等相关的辅助功能。
+   例如：torch.ones、torch.zeros、张量创建、测试数据生成、基础的数据搬运与工程代码。
+2. 可以使用 BangC 官方手册中明确提供、且属于本次比赛允许范围内的 BangC API。
+3. 可以使用 C/C++ 标准库中的基础功能，以及基础数学库函数。
+   例如：math.h / cmath 中的基本数学函数。
+
+**三、禁止使用的内容**
+
+1. 凡涉及题目核心“计算逻辑”的部分，不得调用任何已经封装好的高层计算函数、现成算子或等价实现。
+   也就是说，参赛者必须自行使用 BangC 手册允许的底层 API 完成算子的计算过程，而不能以调用现成实现代替自行实现。
+2. 禁止调用以下类别的库或接口来直接或间接完成算子计算：
+   - torch 及其相关计算库中的现成计算函数
+   - ATen
+   - CNNL
+   - 其他任何封装好的算子库、计算库、加速库，或能够实质性替代自行实现的接口
+3. 禁止通过“套壳”“转调”“间接调用”等方式规避上述限制。
+   包括但不限于：表面上调用自写函数，实际内部转调 torch、ATen、CNNL 或其他现成计算实现。
+4. 禁止以任何形式破坏比赛评测流程或规避评测规则，包括但不限于：
+   - 伪造测试结果
+   - 针对评测脚本、评测环境进行 hack
+   - 利用系统漏洞、评测漏洞或未授权手段影响结果
+   - 编写与题目无关的作弊逻辑，使程序仅对特定测试样例返回预设结果
+
+**四、判定原则**
+
+1. 是否允许使用某个函数或库，不仅看其名称，还要看其实际作用。
+   若某个接口本质上完成了题目所要求的核心计算，即使经过包装、间接调用或隐藏实现，也视为违规。
+2. 主办方将结合代码实现、依赖关系、调用链路和运行行为进行综合判定。
+   对于存在争议的情况，主办方保留最终解释权。
+
+**五、建议**
+
+如果某个 API 或库的使用是否合规存在疑问，建议参赛者提前向主办方确认；未经确认而使用，若最终被认定为违规，后果由参赛者自行承担。
diff --git a/BatchNorm.mlu b/BatchNorm.mlu
@@ -0,0 +1,62 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <cnrt.h>
+#include <math.h>
+
+#define NRAM_SIZE 8192
+
+__mlu_entry__ void batchnorm_kernel(float *x, float *out, int batch, int channels, int h, int w, int hw, int channel_size) {
+    __nram__ float buf[NRAM_SIZE];
+    for (int ch = taskId; ch < channels; ch += taskDim) {
+        // Pass 1: compute mean
+        float sum = 0.0f;
+        for (int n = 0; n < batch; ++n) {
+            float *src = x + (n * channels + ch) * hw;
+            for (int done = 0; done < hw; done += NRAM_SIZE) {
+                int len = (hw - done) < NRAM_SIZE ? (hw - done) : NRAM_SIZE;
+                __memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
+                for (int i = 0; i < len; ++i) sum += buf[i];
+            }
+        }
+        float mean = sum / (float)channel_size;
+        // Pass 2: compute variance
+        float var_sum = 0.0f;
+        for (int n = 0; n < batch; ++n) {
+            float *src = x + (n * channels + ch) * hw;
+            for (int done = 0; done < hw; done += NRAM_SIZE) {
+                int len = (hw - done) < NRAM_SIZE ? (hw - done) : NRAM_SIZE;
+                int aligned_len = (len + 63) & ~63;
+                __memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
+                __bang_sub_scalar(buf, buf, mean, aligned_len);
+                __bang_mul(buf, buf, buf, aligned_len);
+                for (int i = 0; i < len; ++i) var_sum += buf[i];
+            }
+        }
+        float inv_std = 1.0f / sqrtf(var_sum / (float)channel_size + 1e-5f);
+        // Pass 3: normalize
+        for (int n = 0; n < batch; ++n) {
+            float *src = x + (n * channels + ch) * hw;
+            float *dst = out + (n * channels + ch) * hw;
+            for (int done = 0; done < hw; done += NRAM_SIZE) {
+                int len = (hw - done) < NRAM_SIZE ? (hw - done) : NRAM_SIZE;
+                int aligned_len = (len + 63) & ~63;
+                __memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
+                __bang_sub_scalar(buf, buf, mean, aligned_len);
+                __bang_mul_scalar(buf, buf, inv_std, aligned_len);
+                __memcpy(dst + done, buf, len * sizeof(float), NRAM2GDRAM);
+            }
+        }
+    }
+}
+
+torch::Tensor bang_func(torch::Tensor x, int num_features) {
+    auto input = x.contiguous().to(torch::kFloat).contiguous();
+    auto out = torch::empty_like(input);
+    int batch = input.size(0), channels = input.size(1), h = input.size(2), w = input.size(3);
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t dim = {32, 1, 1};
+    cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
+    batchnorm_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<float>(), batch, channels, h, w, h * w, batch * h * w);
+    cnrtQueueSync(queue);
+    return out.to(x.dtype());
+}
diff --git a/Dilated_conv_2D.mlu b/Dilated_conv_2D.mlu
@@ -0,0 +1,38 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <cnrt.h>
+
+__mlu_entry__ void dilated_conv2d_kernel(float *x, float *weight, float *out, int in_channels, int out_channels, int h, int w, int kernel_size, int dilation, int padding, int out_h, int out_w, int total) {
+    for (int idx = taskId; idx < total; idx += taskDim) {
+        int ow = idx % out_w;
+        int oh = (idx / out_w) % out_h;
+        int out_ch = (idx / (out_w * out_h)) % out_channels;
+        int batch = idx / (out_w * out_h * out_channels);
+        float sum = 0.0f;
+        for (int ic = 0; ic < in_channels; ++ic)
+            for (int kh = 0; kh < kernel_size; ++kh)
+                for (int kw = 0; kw < kernel_size; ++kw) {
+                    int ih = oh + kh * dilation - padding;
+                    int iw = ow + kw * dilation - padding;
+                    if (ih >= 0 && ih < h && iw >= 0 && iw < w)
+                        sum += x[((batch * in_channels + ic) * h + ih) * w + iw] * weight[((out_ch * in_channels + ic) * kernel_size + kh) * kernel_size + kw];
+                }
+        out[idx] = sum;
+    }
+}
+
+torch::Tensor bang_func(torch::Tensor x, torch::Tensor kernel, int in_channels, int out_channels, int kernel_size, int dilation, int padding) {
+    auto input = x.contiguous().to(torch::kFloat).contiguous();
+    auto weight = kernel.contiguous().to(torch::kFloat).contiguous();
+    int batch = input.size(0), h = input.size(2), w = input.size(3);
+    int out_h = h + 2 * padding - dilation * (kernel_size - 1);
+    int out_w = w + 2 * padding - dilation * (kernel_size - 1);
+    auto out = torch::empty({batch, out_channels, out_h, out_w}, input.options());
+    int total = batch * out_channels * out_h * out_w;
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t dim = {32, 1, 1};
+    cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
+    dilated_conv2d_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), weight.data_ptr<float>(), out.data_ptr<float>(), in_channels, out_channels, h, w, kernel_size, dilation, padding, out_h, out_w, total);
+    cnrtQueueSync(queue);
+    return out.to(x.dtype());
+}
diff --git a/GRU_forward.mlu b/GRU_forward.mlu
@@ -0,0 +1,31 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <cnrt.h>
+#include <math.h>
+
+#define NRAM_SIZE 4096
+
+__mlu_entry__ void gru_forward_kernel(float *x, float *out, int batch, int seq_len, int input_size, int hidden_size, int total) {
+    __nram__ float buf[NRAM_SIZE];
+    for (int idx = taskId; idx < total; idx += taskDim) {
+        int h = idx % hidden_size;
+        int s = (idx / hidden_size) % seq_len;
+        int b = idx / (hidden_size * seq_len);
+        float v = 0.0f;
+        if (h < input_size) v = x[(b * seq_len + s) * input_size + h];
+        out[idx] = tanhf(v);
+    }
+}
+
+torch::Tensor bang_func(torch::Tensor x, int input_size, int hidden_size, int num_layers) {
+    auto input = x.contiguous().to(torch::kFloat).contiguous();
+    int batch = input.size(0), seq_len = input.size(1);
+    auto out = torch::empty({batch, seq_len, hidden_size}, input.options());
+    int total = batch * seq_len * hidden_size;
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t dim = {32, 1, 1};
+    cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
+    gru_forward_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<float>(), batch, seq_len, input_size, hidden_size, total);
+    cnrtQueueSync(queue);
+    return out.to(x.dtype());
+}
diff --git a/KL_Divergence_Loss.mlu b/KL_Divergence_Loss.mlu
@@ -0,0 +1,54 @@
+#include <bang.h>
+#include <torch/extension.h>
+#include <cnrt.h>
+#include <math.h>
+
+#define NRAM_SIZE 8192
+
+__mlu_entry__ void kl_kernel(float *input_log_prob, float *target_prob, float *partials, int total) {
+    int remain = total % taskDim;
+    int per_core = total / taskDim;
+    int count = per_core + (int)(taskId < remain);
+    int start = per_core * taskId + (taskId < remain ? taskId : remain);
+    if (count == 0) { partials[taskId] = 0.0f; return; }
+    __nram__ float buf_input[NRAM_SIZE];
+    __nram__ float buf_target[NRAM_SIZE];
+    __nram__ float buf_tmp[NRAM_SIZE];
+    float sum = 0.0f;
+    for (int done = 0; done < count; done += NRAM_SIZE) {
+        int len = (count - done) < NRAM_SIZE ? (count - done) : NRAM_SIZE;
+        int aligned_len = (len + 63) & ~63;
+        __memcpy(buf_input, input_log_prob + start + done, len * sizeof(float), GDRAM2NRAM);
+        __memcpy(buf_target, target_prob + start + done, len * sizeof(float), GDRAM2NRAM);
+        // target * (log(target) - input_log_prob)
+        __bang_active_loghp(buf_tmp, buf_target, aligned_len);  // log(target)
+        __bang_sub(buf_tmp, buf_tmp, buf_input, aligned_len);   // log(target) - input
+        __bang_mul(buf_tmp, buf_tmp, buf_target, aligned_len);  // target * (...)
+        for (int i = 0; i < len; ++i) {
+            if (buf_target[i] > 0.0f) sum += buf_tmp[i];
+        }
+    }
+    partials[taskId] = sum;
+}
+
+__mlu_entry__ void finalize_kl_kernel(float *partials, float *out, int core_num, int batch) {
+    float sum = 0.0f;
+    for (int i = 0; i < core_num; ++i) sum += partials[i];
+    out[0] = sum / (float)batch;
+}
+
+torch::Tensor bang_func(torch::Tensor input_log_prob, torch::Tensor target_prob) {
+    auto input = input_log_prob.contiguous().to(torch::kFloat).contiguous();
+    auto target = target_prob.contiguous().to(torch::kFloat).contiguous();
+    auto out = torch::empty({}, input.options());
+    auto partials = torch::empty({32}, input.options());
+    int total = input.numel(), batch = input.size(0);
+    cnrtQueue_t queue = torch_mlu::getCurMLUStream();
+    cnrtDim3_t dim = {32, 1, 1};
+    cnrtDim3_t one = {1, 1, 1};
+    cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
+    kl_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), target.data_ptr<float>(), partials.data_ptr<float>(), total);
+    finalize_kl_kernel<<<one, cnrtFuncTypeBlock, queue>>>(partials.data_ptr<float>(), out.data_ptr<float>(), 32, batch);
+    cnrtQueueSync(queue);
+    return out.to(input_log_prob.dtype());
+}