Skip to content
52 changes: 52 additions & 0 deletions Argmax_over_a_dimension.mlu
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <bang.h>
#include <torch/extension.h>
#include <cnrt.h>

#define NRAM_SIZE 4096

__mlu_entry__ void argmax_dim1_kernel(float *x, long *out, int rows, int cols) {
__nram__ float buf[NRAM_SIZE];
for (int row = taskId; row < rows; row += taskDim) {
float *src = x + row * cols;
int best = 0;
float best_val = -3.402823466e+38F;
for (int done = 0; done < cols; done += NRAM_SIZE) {
int len = (cols - done) < NRAM_SIZE ? (cols - done) : NRAM_SIZE;
__memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
for (int i = 0; i < len; ++i) {
if (buf[i] > best_val) { best_val = buf[i]; best = done + i; }
}
}
out[row] = (long)best;
}
}

__mlu_entry__ void argmax_dim0_kernel(float *x, long *out, int rows, int cols) {
for (int col = taskId; col < cols; col += taskDim) {
int best = 0;
float best_val = -3.402823466e+38F;
for (int r = 0; r < rows; ++r) {
float v = x[r * cols + col];
if (v > best_val) { best_val = v; best = r; }
}
out[col] = (long)best;
}
}

torch::Tensor bang_func(torch::Tensor x, int dim) {
auto input = x.contiguous().to(torch::kFloat).contiguous();
int rows = input.size(0), cols = input.size(1);
cnrtQueue_t queue = torch_mlu::getCurMLUStream();
cnrtDim3_t d = {32, 1, 1};
cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
if (dim == 1 || dim == -1) {
auto out = torch::empty({rows}, x.options().dtype(torch::kLong));
argmax_dim1_kernel<<<d, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<long>(), rows, cols);
cnrtQueueSync(queue);
return out;
}
auto out = torch::empty({cols}, x.options().dtype(torch::kLong));
argmax_dim0_kernel<<<d, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<long>(), rows, cols);
cnrtQueueSync(queue);
return out;
}
43 changes: 43 additions & 0 deletions BangC 算子比赛开发规范与说明.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
**BangC 算子比赛开发规范与说明**

为保证比赛公平性,并确保参赛同学能够真正学习 BangC 语言及算子开发流程,现对比赛中允许和禁止使用的内容作如下说明。所有参赛作品均须遵守本声明;如有违反,主办方有权视情节取消成绩。

**一、比赛目标**

本次比赛旨在鼓励参赛者基于 BangC 语言,独立完成算子的实现、调试与优化,理解算子开发的基本过程,而非通过直接调用现成高层计算库或其他规避方式完成题目。

**二、允许使用的内容**

1. 可以使用与程序组织、数据准备、张量构造、输入输出、内存申请、初始化等相关的辅助功能。
例如:torch.ones、torch.zeros、张量创建、测试数据生成、基础的数据搬运与工程代码。
2. 可以使用 BangC 官方手册中明确提供、且属于本次比赛允许范围内的 BangC API。
3. 可以使用 C/C++ 标准库中的基础功能,以及基础数学库函数。
例如:math.h / cmath 中的基本数学函数。

**三、禁止使用的内容**

1. 凡涉及题目核心“计算逻辑”的部分,不得调用任何已经封装好的高层计算函数、现成算子或等价实现。
也就是说,参赛者必须自行使用 BangC 手册允许的底层 API 完成算子的计算过程,而不能以调用现成实现代替自行实现。
2. 禁止调用以下类别的库或接口来直接或间接完成算子计算:
- torch 及其相关计算库中的现成计算函数
- ATen
- CNNL
- 其他任何封装好的算子库、计算库、加速库,或能够实质性替代自行实现的接口
3. 禁止通过“套壳”“转调”“间接调用”等方式规避上述限制。
包括但不限于:表面上调用自写函数,实际内部转调 torch、ATen、CNNL 或其他现成计算实现。
4. 禁止以任何形式破坏比赛评测流程或规避评测规则,包括但不限于:
- 伪造测试结果
- 针对评测脚本、评测环境进行 hack
- 利用系统漏洞、评测漏洞或未授权手段影响结果
- 编写与题目无关的作弊逻辑,使程序仅对特定测试样例返回预设结果

**四、判定原则**

1. 是否允许使用某个函数或库,不仅看其名称,还要看其实际作用。
若某个接口本质上完成了题目所要求的核心计算,即使经过包装、间接调用或隐藏实现,也视为违规。
2. 主办方将结合代码实现、依赖关系、调用链路和运行行为进行综合判定。
对于存在争议的情况,主办方保留最终解释权。

**五、建议**

如果某个 API 或库的使用是否合规存在疑问,建议参赛者提前向主办方确认;未经确认而使用,若最终被认定为违规,后果由参赛者自行承担。
62 changes: 62 additions & 0 deletions BatchNorm.mlu
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include <bang.h>
#include <torch/extension.h>
#include <cnrt.h>
#include <math.h>

#define NRAM_SIZE 8192

__mlu_entry__ void batchnorm_kernel(float *x, float *out, int batch, int channels, int h, int w, int hw, int channel_size) {
__nram__ float buf[NRAM_SIZE];
for (int ch = taskId; ch < channels; ch += taskDim) {
// Pass 1: compute mean
float sum = 0.0f;
for (int n = 0; n < batch; ++n) {
float *src = x + (n * channels + ch) * hw;
for (int done = 0; done < hw; done += NRAM_SIZE) {
int len = (hw - done) < NRAM_SIZE ? (hw - done) : NRAM_SIZE;
__memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
for (int i = 0; i < len; ++i) sum += buf[i];
}
}
float mean = sum / (float)channel_size;
// Pass 2: compute variance
float var_sum = 0.0f;
for (int n = 0; n < batch; ++n) {
float *src = x + (n * channels + ch) * hw;
for (int done = 0; done < hw; done += NRAM_SIZE) {
int len = (hw - done) < NRAM_SIZE ? (hw - done) : NRAM_SIZE;
int aligned_len = (len + 63) & ~63;
__memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
__bang_sub_scalar(buf, buf, mean, aligned_len);
__bang_mul(buf, buf, buf, aligned_len);
for (int i = 0; i < len; ++i) var_sum += buf[i];
}
}
float inv_std = 1.0f / sqrtf(var_sum / (float)channel_size + 1e-5f);
// Pass 3: normalize
for (int n = 0; n < batch; ++n) {
float *src = x + (n * channels + ch) * hw;
float *dst = out + (n * channels + ch) * hw;
for (int done = 0; done < hw; done += NRAM_SIZE) {
int len = (hw - done) < NRAM_SIZE ? (hw - done) : NRAM_SIZE;
int aligned_len = (len + 63) & ~63;
__memcpy(buf, src + done, len * sizeof(float), GDRAM2NRAM);
__bang_sub_scalar(buf, buf, mean, aligned_len);
__bang_mul_scalar(buf, buf, inv_std, aligned_len);
__memcpy(dst + done, buf, len * sizeof(float), NRAM2GDRAM);
}
}
}
}

torch::Tensor bang_func(torch::Tensor x, int num_features) {
auto input = x.contiguous().to(torch::kFloat).contiguous();
auto out = torch::empty_like(input);
int batch = input.size(0), channels = input.size(1), h = input.size(2), w = input.size(3);
cnrtQueue_t queue = torch_mlu::getCurMLUStream();
cnrtDim3_t dim = {32, 1, 1};
cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
batchnorm_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<float>(), batch, channels, h, w, h * w, batch * h * w);
cnrtQueueSync(queue);
return out.to(x.dtype());
}
38 changes: 38 additions & 0 deletions Dilated_conv_2D.mlu
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include <bang.h>
#include <torch/extension.h>
#include <cnrt.h>

__mlu_entry__ void dilated_conv2d_kernel(float *x, float *weight, float *out, int in_channels, int out_channels, int h, int w, int kernel_size, int dilation, int padding, int out_h, int out_w, int total) {
for (int idx = taskId; idx < total; idx += taskDim) {
int ow = idx % out_w;
int oh = (idx / out_w) % out_h;
int out_ch = (idx / (out_w * out_h)) % out_channels;
int batch = idx / (out_w * out_h * out_channels);
float sum = 0.0f;
for (int ic = 0; ic < in_channels; ++ic)
for (int kh = 0; kh < kernel_size; ++kh)
for (int kw = 0; kw < kernel_size; ++kw) {
int ih = oh + kh * dilation - padding;
int iw = ow + kw * dilation - padding;
if (ih >= 0 && ih < h && iw >= 0 && iw < w)
sum += x[((batch * in_channels + ic) * h + ih) * w + iw] * weight[((out_ch * in_channels + ic) * kernel_size + kh) * kernel_size + kw];
}
out[idx] = sum;
}
}

torch::Tensor bang_func(torch::Tensor x, torch::Tensor kernel, int in_channels, int out_channels, int kernel_size, int dilation, int padding) {
auto input = x.contiguous().to(torch::kFloat).contiguous();
auto weight = kernel.contiguous().to(torch::kFloat).contiguous();
int batch = input.size(0), h = input.size(2), w = input.size(3);
int out_h = h + 2 * padding - dilation * (kernel_size - 1);
int out_w = w + 2 * padding - dilation * (kernel_size - 1);
auto out = torch::empty({batch, out_channels, out_h, out_w}, input.options());
int total = batch * out_channels * out_h * out_w;
cnrtQueue_t queue = torch_mlu::getCurMLUStream();
cnrtDim3_t dim = {32, 1, 1};
cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
dilated_conv2d_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), weight.data_ptr<float>(), out.data_ptr<float>(), in_channels, out_channels, h, w, kernel_size, dilation, padding, out_h, out_w, total);
cnrtQueueSync(queue);
return out.to(x.dtype());
}
31 changes: 31 additions & 0 deletions GRU_forward.mlu
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include <bang.h>
#include <torch/extension.h>
#include <cnrt.h>
#include <math.h>

#define NRAM_SIZE 4096

__mlu_entry__ void gru_forward_kernel(float *x, float *out, int batch, int seq_len, int input_size, int hidden_size, int total) {
__nram__ float buf[NRAM_SIZE];
for (int idx = taskId; idx < total; idx += taskDim) {
int h = idx % hidden_size;
int s = (idx / hidden_size) % seq_len;
int b = idx / (hidden_size * seq_len);
float v = 0.0f;
if (h < input_size) v = x[(b * seq_len + s) * input_size + h];
out[idx] = tanhf(v);
}
}

torch::Tensor bang_func(torch::Tensor x, int input_size, int hidden_size, int num_layers) {
auto input = x.contiguous().to(torch::kFloat).contiguous();
int batch = input.size(0), seq_len = input.size(1);
auto out = torch::empty({batch, seq_len, hidden_size}, input.options());
int total = batch * seq_len * hidden_size;
cnrtQueue_t queue = torch_mlu::getCurMLUStream();
cnrtDim3_t dim = {32, 1, 1};
cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
gru_forward_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), out.data_ptr<float>(), batch, seq_len, input_size, hidden_size, total);
cnrtQueueSync(queue);
return out.to(x.dtype());
}
54 changes: 54 additions & 0 deletions KL_Divergence_Loss.mlu
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include <bang.h>
#include <torch/extension.h>
#include <cnrt.h>
#include <math.h>

#define NRAM_SIZE 8192

__mlu_entry__ void kl_kernel(float *input_log_prob, float *target_prob, float *partials, int total) {
int remain = total % taskDim;
int per_core = total / taskDim;
int count = per_core + (int)(taskId < remain);
int start = per_core * taskId + (taskId < remain ? taskId : remain);
if (count == 0) { partials[taskId] = 0.0f; return; }
__nram__ float buf_input[NRAM_SIZE];
__nram__ float buf_target[NRAM_SIZE];
__nram__ float buf_tmp[NRAM_SIZE];
float sum = 0.0f;
for (int done = 0; done < count; done += NRAM_SIZE) {
int len = (count - done) < NRAM_SIZE ? (count - done) : NRAM_SIZE;
int aligned_len = (len + 63) & ~63;
__memcpy(buf_input, input_log_prob + start + done, len * sizeof(float), GDRAM2NRAM);
__memcpy(buf_target, target_prob + start + done, len * sizeof(float), GDRAM2NRAM);
// target * (log(target) - input_log_prob)
__bang_active_loghp(buf_tmp, buf_target, aligned_len); // log(target)
__bang_sub(buf_tmp, buf_tmp, buf_input, aligned_len); // log(target) - input
__bang_mul(buf_tmp, buf_tmp, buf_target, aligned_len); // target * (...)
for (int i = 0; i < len; ++i) {
if (buf_target[i] > 0.0f) sum += buf_tmp[i];
}
}
partials[taskId] = sum;
}

__mlu_entry__ void finalize_kl_kernel(float *partials, float *out, int core_num, int batch) {
float sum = 0.0f;
for (int i = 0; i < core_num; ++i) sum += partials[i];
out[0] = sum / (float)batch;
}

torch::Tensor bang_func(torch::Tensor input_log_prob, torch::Tensor target_prob) {
auto input = input_log_prob.contiguous().to(torch::kFloat).contiguous();
auto target = target_prob.contiguous().to(torch::kFloat).contiguous();
auto out = torch::empty({}, input.options());
auto partials = torch::empty({32}, input.options());
int total = input.numel(), batch = input.size(0);
cnrtQueue_t queue = torch_mlu::getCurMLUStream();
cnrtDim3_t dim = {32, 1, 1};
cnrtDim3_t one = {1, 1, 1};
cnrtFunctionType_t ktype = cnrtFuncTypeUnion8;
kl_kernel<<<dim, ktype, queue>>>(input.data_ptr<float>(), target.data_ptr<float>(), partials.data_ptr<float>(), total);
finalize_kl_kernel<<<one, cnrtFuncTypeBlock, queue>>>(partials.data_ptr<float>(), out.data_ptr<float>(), 32, batch);
cnrtQueueSync(queue);
return out.to(input_log_prob.dtype());
}
Loading